From 9ade58f0555430cec851e307c83c3a56c4a77d0b Mon Sep 17 00:00:00 2001 From: Michael Benayoun Date: Tue, 2 May 2023 17:20:02 +0200 Subject: [PATCH 001/935] [ONNX] Sam fix (#23110) * [WIP] Fix for the ONNX export * Apply changes * Remove commented code * Resolve todo * empty -> zeros * fix slow tests --------- Co-authored-by: younesbelkada --- src/transformers/models/sam/modeling_sam.py | 49 +++++++++++---------- 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/src/transformers/models/sam/modeling_sam.py b/src/transformers/models/sam/modeling_sam.py index 393f20cfc83e..bf14a4b2413a 100644 --- a/src/transformers/models/sam/modeling_sam.py +++ b/src/transformers/models/sam/modeling_sam.py @@ -223,9 +223,7 @@ def _separate_heads(self, hidden_states: Tensor, num_attention_heads: int) -> Te def _recombine_heads(self, hidden_states: Tensor, point_batch_size: int) -> Tensor: batch, n_heads, n_tokens, c_per_head = hidden_states.shape hidden_states = hidden_states.transpose(1, 2) - return hidden_states.reshape( - batch // max(1, point_batch_size), point_batch_size, n_tokens, n_heads * c_per_head - ) + return hidden_states.reshape(batch // point_batch_size, point_batch_size, n_tokens, n_heads * c_per_head) def forward(self, query: Tensor, key: Tensor, value: Tensor) -> Tensor: # Input projections @@ -482,7 +480,7 @@ def forward( Whether or not to return the attentions tensors of all attention layers. """ batch_size, num_channels, height, width = image_embeddings.shape - point_batch_size = max(1, sparse_prompt_embeddings.shape[1]) + point_batch_size = sparse_prompt_embeddings.shape[1] # Concatenate output tokens output_tokens = torch.cat([self.iou_token.weight, self.mask_tokens.weight], dim=0) output_tokens = output_tokens.repeat(batch_size, point_batch_size, 1, 1) @@ -634,8 +632,18 @@ def _embed_points(self, points: torch.Tensor, labels: torch.Tensor, pad: bool) - torch.tensor(0.0, dtype=point_embedding.dtype, device=point_embedding.device), ) - point_embedding[labels == 0] += self.point_embed[0].weight - point_embedding[labels == 1] += self.point_embed[1].weight + point_embedding = torch.where( + (labels == 0)[:, :, :, None], + point_embedding + self.point_embed[0].weight[None, None, :, :], + point_embedding, + ) + + point_embedding = torch.where( + (labels == 1)[:, :, :, None], + point_embedding + self.point_embed[1].weight[None, None, :, :], + point_embedding, + ) + return point_embedding def _embed_boxes(self, boxes: torch.Tensor) -> torch.Tensor: @@ -675,8 +683,7 @@ def forward( if input_labels is None: raise ValueError("If points are provided, labels must also be provided.") point_embeddings = self._embed_points(input_points, input_labels, pad=(input_boxes is None)) - sparse_embeddings = torch.empty((batch_size, point_batch_size, 0, self.hidden_size), device=target_device) - sparse_embeddings = torch.cat([sparse_embeddings, point_embeddings], dim=2) + sparse_embeddings = point_embeddings if input_boxes is not None: batch_size = input_boxes.shape[0] box_embeddings = self._embed_boxes(input_boxes) @@ -692,7 +699,7 @@ def forward( ) if sparse_embeddings is None: - sparse_embeddings = torch.empty((batch_size, 0, 1, self.hidden_size), device=target_device) + sparse_embeddings = torch.zeros((batch_size, 1, 1, self.hidden_size), device=target_device) return sparse_embeddings, dense_embeddings @@ -742,17 +749,13 @@ def get_rel_pos(self, q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch. Extracted positional embeddings according to relative positions. 
""" max_rel_dist = int(2 * max(q_size, k_size) - 1) - # Interpolate rel pos if needed. - if rel_pos.shape[0] != max_rel_dist: - # Interpolate rel pos. - rel_pos_resized = F.interpolate( - rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1), - size=max_rel_dist, - mode="linear", - ) - rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0) - else: - rel_pos_resized = rel_pos + # Interpolate rel pos. + rel_pos_resized = F.interpolate( + rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1), + size=max_rel_dist, + mode="linear", + ) + rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0) # Scale the coords with short length if shapes for q and k are different. q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0) @@ -865,8 +868,7 @@ def window_partition(self, hidden_states: torch.Tensor, window_size: int) -> Tup pad_h = (window_size - height % window_size) % window_size pad_w = (window_size - width % window_size) % window_size - if pad_h > 0 or pad_w > 0: - hidden_states = F.pad(hidden_states, (0, 0, 0, pad_w, 0, pad_h)) + hidden_states = F.pad(hidden_states, (0, 0, 0, pad_w, 0, pad_h)) pad_height, pad_width = height + pad_h, width + pad_w hidden_states = hidden_states.reshape( @@ -902,8 +904,7 @@ def window_unpartition( hidden_states.permute(0, 1, 3, 2, 4, 5).contiguous().reshape(batch_size, pad_height, pad_width, -1) ) - if pad_height > height or pad_width > width: - hidden_states = hidden_states[:, :height, :width, :].contiguous() + hidden_states = hidden_states[:, :height, :width, :].contiguous() return hidden_states def forward( From 805db1fe13b3155d61ac5571439f5d619e47022f Mon Sep 17 00:00:00 2001 From: Alex Punnen Date: Tue, 2 May 2023 22:37:30 +0530 Subject: [PATCH 002/935] num_noise_spans should be <= num_items #22246 (#22938) --- examples/flax/language-modeling/run_t5_mlm_flax.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/flax/language-modeling/run_t5_mlm_flax.py b/examples/flax/language-modeling/run_t5_mlm_flax.py index 152760f4bf4b..f3cec97b2e82 100755 --- a/examples/flax/language-modeling/run_t5_mlm_flax.py +++ b/examples/flax/language-modeling/run_t5_mlm_flax.py @@ -418,13 +418,14 @@ def random_spans_noise_mask(self, length): orig_length = length num_noise_tokens = int(np.round(length * self.noise_density)) + num_nonnoise_tokens = length - num_noise_tokens # avoid degeneracy by ensuring positive numbers of noise and nonnoise tokens. num_noise_tokens = min(max(num_noise_tokens, 1), length - 1) - num_noise_spans = int(np.round(num_noise_tokens / self.mean_noise_span_length)) + # num_noise_tokens should be less than num_noise_tokens and num_nonnoise_tokens + num_noise_spans = int(np.round(min(num_noise_tokens, num_nonnoise_tokens) / self.mean_noise_span_length)) # avoid degeneracy by ensuring positive number of noise spans num_noise_spans = max(num_noise_spans, 1) - num_nonnoise_tokens = length - num_noise_tokens # pick the lengths of the noise spans and the non-noise spans def _random_segmentation(num_items, num_segments): From 3ff89f29f542dc0e16b48e7b8d7710555f454d5c Mon Sep 17 00:00:00 2001 From: "Gregory (Gabriel) Barello" <48561156+gbarello-uipath@users.noreply.github.com> Date: Tue, 2 May 2023 10:40:41 -0700 Subject: [PATCH 003/935] Fixed default config for `Pix2Struct` model to set `Pix2StructTextModel` to `is_decoder=True` (#23051) added as default keyword arg. 
to in order to correctly configure the decoder --- src/transformers/models/pix2struct/configuration_pix2struct.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/transformers/models/pix2struct/configuration_pix2struct.py b/src/transformers/models/pix2struct/configuration_pix2struct.py index 244cb2705867..32aa34941f8f 100644 --- a/src/transformers/models/pix2struct/configuration_pix2struct.py +++ b/src/transformers/models/pix2struct/configuration_pix2struct.py @@ -118,6 +118,7 @@ def __init__( pad_token_id=0, eos_token_id=1, tie_word_embeddings=False, + is_decoder=True, **kwargs, ): self.vocab_size = vocab_size @@ -144,6 +145,7 @@ def __init__( eos_token_id=eos_token_id, decoder_start_token_id=decoder_start_token_id, tie_word_embeddings=tie_word_embeddings, + is_decoder=is_decoder, **kwargs, ) From 4b6aecb48e4961efef9edb8062dbbdd1f3d9385e Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Tue, 2 May 2023 22:02:39 -0400 Subject: [PATCH 004/935] Pin numba for now (#23118) --- setup.py | 4 +++- src/transformers/dependency_versions_table.py | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 1eaee04a67b9..abafc03bc6f2 100644 --- a/setup.py +++ b/setup.py @@ -135,6 +135,7 @@ "librosa", "nltk", "natten>=0.14.6", + "numba<0.57.0", # Can be removed once unpinned. "numpy>=1.17", "onnxconverter-common", "onnxruntime-tools>=1.4.2", @@ -286,7 +287,8 @@ def run(self): extras["integrations"] = extras["optuna"] + extras["ray"] + extras["sigopt"] extras["serving"] = deps_list("pydantic", "uvicorn", "fastapi", "starlette") -extras["audio"] = deps_list("librosa", "pyctcdecode", "phonemizer", "kenlm") +# numba can be removed here once unpinned +extras["audio"] = deps_list("librosa", "pyctcdecode", "phonemizer", "kenlm", "numba") # `pip install ".[speech]"` is deprecated and `pip install ".[torch-speech]"` should be used instead extras["speech"] = deps_list("torchaudio") + extras["audio"] extras["torch-speech"] = deps_list("torchaudio") + extras["audio"] diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index 68ad7a1587f4..bae19acd3e1d 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -36,6 +36,7 @@ "librosa": "librosa", "nltk": "nltk", "natten": "natten>=0.14.6", + "numba": "numba<0.57.0", "numpy": "numpy>=1.17", "onnxconverter-common": "onnxconverter-common", "onnxruntime-tools": "onnxruntime-tools>=1.4.2", From b61d5b47f640308068139561f673765b2af39874 Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Wed, 3 May 2023 11:21:59 +0200 Subject: [PATCH 005/935] =?UTF-8?q?[`Doctest`]=C2=A0Fix=20pix2struct=20doc?= =?UTF-8?q?test=20(#23121)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix pix2struct doctest --- src/transformers/models/pix2struct/modeling_pix2struct.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/pix2struct/modeling_pix2struct.py b/src/transformers/models/pix2struct/modeling_pix2struct.py index 97fbbb57ac2a..ffaebc20372e 100644 --- a/src/transformers/models/pix2struct/modeling_pix2struct.py +++ b/src/transformers/models/pix2struct/modeling_pix2struct.py @@ -1702,7 +1702,7 @@ def forward( >>> outputs = model(**inputs, labels=labels) >>> loss = outputs.loss >>> print(f"{loss.item():.5f}") - 4.58370 + 5.95566 ```""" use_cache = use_cache if use_cache 
is not None else self.config.text_config.use_cache return_dict = return_dict if return_dict is not None else self.config.use_return_dict From ce31e3c8bf039ef42294e9434cc68bcf234bd5ef Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Wed, 3 May 2023 14:24:50 +0100 Subject: [PATCH 006/935] Generate: slow assisted generation test (#23125) --- tests/generation/test_utils.py | 1 + tests/models/roberta/test_modeling_roberta.py | 4 ---- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index 0dfb4368d7d0..3b96f2b2bdff 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -1457,6 +1457,7 @@ def test_contrastive_generate_dict_outputs_use_cache(self): for output in (output_contrastive, output_generate): self._check_outputs(output, input_ids, model.config, use_cache=True) + @slow # TODO(Joao): remove this. Some models (e.g. data2vec, xcom, roberta) have an error rate between 1 and 10%. def test_assisted_decoding_matches_greedy_search(self): # This test ensures that the assisted generation does not introduce output changes over greedy search. # It breaks the pattern in the tests above, for multiple reasons: diff --git a/tests/models/roberta/test_modeling_roberta.py b/tests/models/roberta/test_modeling_roberta.py index 11b400115678..49caa67d4f6c 100644 --- a/tests/models/roberta/test_modeling_roberta.py +++ b/tests/models/roberta/test_modeling_roberta.py @@ -397,10 +397,6 @@ class RobertaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi ) fx_compatible = True - @unittest.skip(reason="Fix me @gante") - def test_assisted_greedy_search_matches_greedy_search(self): - super().test_assisted_greedy_search_matches_greedy_search() - def setUp(self): self.model_tester = RobertaModelTester(self) self.config_tester = ConfigTester(self, config_class=RobertaConfig, hidden_size=37) From a0bd464776149e8eb5b73f19d7b3c3bebe6886f9 Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Wed, 3 May 2023 14:29:55 +0100 Subject: [PATCH 007/935] Generate: correct beam search length on score calculation for multi batch generation (#23127) --- src/transformers/generation/beam_search.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/transformers/generation/beam_search.py b/src/transformers/generation/beam_search.py index 5c423a6a1807..d8bd8010dce3 100644 --- a/src/transformers/generation/beam_search.py +++ b/src/transformers/generation/beam_search.py @@ -212,7 +212,7 @@ def process( eos_token_id: Optional[Union[int, List[int]]] = None, beam_indices: Optional[torch.LongTensor] = None, ) -> Tuple[torch.Tensor]: - cur_len = input_ids.shape[-1] + cur_len = input_ids.shape[-1] + 1 # add up to the length which the next_scores is calculated on batch_size = len(self._beam_hyps) if not (batch_size == (input_ids.shape[0] // self.group_size)): if self.num_beam_groups > 1: @@ -287,7 +287,6 @@ def process( ) # Check if we are done so that we can save a pad step if all(done) - cur_len += 1 # add up to the length which the next_scores is calculated on self._done[batch_idx] = self._done[batch_idx] or beam_hyp.is_done( next_scores[batch_idx].max().item(), cur_len ) @@ -532,7 +531,7 @@ def process( indicating to which beam the next tokens shall be added. 
""" - cur_len = input_ids.shape[-1] + cur_len = input_ids.shape[-1] + 1 # add up to the length which the next_scores is calculated on batch_size = len(self._beam_hyps) if not (batch_size == (input_ids.shape[0] // self.group_size)): if self.num_beam_groups > 1: @@ -617,7 +616,6 @@ def process( ) # Check if we are done so that we can save a pad step if all(done) - cur_len += 1 # add up to the length which the next_scores is calculated on self._done[batch_idx] = self._done[batch_idx] or beam_hyp.is_done( next_scores[batch_idx].max().item(), cur_len ) From 2a16d8b275e4a76ee07c8c92ddb2e570ba3cbd9d Mon Sep 17 00:00:00 2001 From: Manuel <43467008+ManuelFay@users.noreply.github.com> Date: Wed, 3 May 2023 15:36:30 +0200 Subject: [PATCH 008/935] improve unclear documentation (#23123) --- src/transformers/trainer_callback.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/transformers/trainer_callback.py b/src/transformers/trainer_callback.py index 8749e5f3f574..808c1b4702f9 100644 --- a/src/transformers/trainer_callback.py +++ b/src/transformers/trainer_callback.py @@ -534,7 +534,8 @@ class EarlyStoppingCallback(TrainerCallback): specified metric must improve to satisfy early stopping conditions. ` This callback depends on [`TrainingArguments`] argument *load_best_model_at_end* functionality to set best_metric - in [`TrainerState`]. + in [`TrainerState`]. Note that if the [`TrainingArguments`] argument *save_steps* differs from *eval_steps*, the + early stopping will not occur until the next save step. """ def __init__(self, early_stopping_patience: int = 1, early_stopping_threshold: Optional[float] = 0.0): From 3a08dc63fd788f768e1f16a97db14d0015368940 Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Wed, 3 May 2023 14:43:17 +0100 Subject: [PATCH 009/935] Generate: better warnings with pipelines (#23128) --- src/transformers/pipelines/base.py | 4 +++- src/transformers/pipelines/text2text_generation.py | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/transformers/pipelines/base.py b/src/transformers/pipelines/base.py index b728e94f34ee..de6c9a8ec4d9 100644 --- a/src/transformers/pipelines/base.py +++ b/src/transformers/pipelines/base.py @@ -803,10 +803,12 @@ def __init__( self.torch_dtype = torch_dtype self.binary_output = binary_output - # Update config with task specific parameters + # Update config and generation_config with task specific parameters task_specific_params = self.model.config.task_specific_params if task_specific_params is not None and task in task_specific_params: self.model.config.update(task_specific_params.get(task)) + if self.model.can_generate(): + self.model.generation_config.update(**task_specific_params.get(task)) self.call_count = 0 self._batch_size = kwargs.pop("batch_size", None) diff --git a/src/transformers/pipelines/text2text_generation.py b/src/transformers/pipelines/text2text_generation.py index dbd45e6ff1be..48df10336a65 100644 --- a/src/transformers/pipelines/text2text_generation.py +++ b/src/transformers/pipelines/text2text_generation.py @@ -273,7 +273,8 @@ def check_inputs(self, input_length: int, min_length: int, max_length: int) -> b if input_length < max_length: logger.warning( - f"Your max_length is set to {max_length}, but your input_length is only {input_length}. You might " + f"Your max_length is set to {max_length}, but your input_length is only {input_length}. 
Since this is " + "a summarization task, where outputs shorter than the input are typically wanted, you might " f"consider decreasing max_length manually, e.g. summarizer('...', max_length={input_length//2})" ) From b53004fdcefa6b74ce883c3853c983d0beb14bdd Mon Sep 17 00:00:00 2001 From: Samin Yasar Date: Wed, 3 May 2023 19:53:00 +0600 Subject: [PATCH 010/935] Add resources for LayoutLmV2 and reformat documentation resources (#23115) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add resources for layoutlmv2 * remove 🌎 from some resources --- docs/source/en/model_doc/layoutlmv2.mdx | 29 +++++++++++++++++++------ 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/docs/source/en/model_doc/layoutlmv2.mdx b/docs/source/en/model_doc/layoutlmv2.mdx index 031cce83deb2..6d2a9dc3bbee 100644 --- a/docs/source/en/model_doc/layoutlmv2.mdx +++ b/docs/source/en/model_doc/layoutlmv2.mdx @@ -121,6 +121,28 @@ section below. In addition, there's LayoutXLM, which is a multilingual version of LayoutLMv2. More information can be found on [LayoutXLM's documentation page](layoutxlm). +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with LayoutLMv2. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. + + + +- A notebook on how to [finetune LayoutLMv2 for text-classification on RVL-CDIP dataset](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLMv2/RVL-CDIP/Fine_tuning_LayoutLMv2ForSequenceClassification_on_RVL_CDIP.ipynb). +- See also: [Text classification task guide](../tasks/sequence_classification) + + + +- A notebook on how to [finetune LayoutLMv2 for question-answering on DocVQA dataset](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLMv2/DocVQA/Fine_tuning_LayoutLMv2ForQuestionAnswering_on_DocVQA.ipynb). +- See also: [Question answering task guide](../tasks/question_answering) +- See also: [Document question answering task guide](../tasks/document_question_answering) + + + + +- A notebook on how to [finetune LayoutLMv2 for token-classification on CORD dataset](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLMv2/CORD/Fine_tuning_LayoutLMv2ForTokenClassification_on_CORD.ipynb). +- A notebook on how to [finetune LayoutLMv2 for token-classification on FUNSD dataset](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLMv2/FUNSD/Fine_tuning_LayoutLMv2ForTokenClassification_on_FUNSD_using_HuggingFace_Trainer.ipynb). 
+- See also: [Token classification task guide](../tasks/token_classification) + ## Usage: LayoutLMv2Processor The easiest way to prepare data for the model is to use [`LayoutLMv2Processor`], which internally @@ -266,13 +288,6 @@ print(encoding.keys()) # dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'bbox', 'image']) ``` -## Documentation resources - -- [Document question answering task guide](../tasks/document_question_answering) -- [Text classification task guide](../tasks/sequence_classification) -- [Token classification task guide](../tasks/token_classification) -- [Question answering task guide](../tasks/question_answering) - ## LayoutLMv2Config [[autodoc]] LayoutLMv2Config From 56b8d49ddfceb4278e7d893f6ef7d13c86b67078 Mon Sep 17 00:00:00 2001 From: Alara Dirik <8944735+alaradirik@users.noreply.github.com> Date: Wed, 3 May 2023 17:21:27 +0300 Subject: [PATCH 011/935] Fix ConvNext V2 paramater naming issue (#23122) Fixes the parameter naming issue in ConvNextV2GRN module --- .../models/convnextv2/convert_convnextv2_to_pytorch.py | 4 ++++ src/transformers/models/convnextv2/modeling_convnextv2.py | 6 +++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/convnextv2/convert_convnextv2_to_pytorch.py b/src/transformers/models/convnextv2/convert_convnextv2_to_pytorch.py index b2a0a52d27e8..8094ecf0d615 100644 --- a/src/transformers/models/convnextv2/convert_convnextv2_to_pytorch.py +++ b/src/transformers/models/convnextv2/convert_convnextv2_to_pytorch.py @@ -99,6 +99,10 @@ def rename_key(name): if "stages" in name and "downsampling_layer" not in name: # stages.0.0. for instance should be renamed to stages.0.layers.0. name = name[: len("stages.0")] + ".layers" + name[len("stages.0") :] + if "gamma" in name: + name = name.replace("gamma", "weight") + if "beta" in name: + name = name.replace("beta", "bias") if "stages" in name: name = name.replace("stages", "encoder.stages") if "norm" in name: diff --git a/src/transformers/models/convnextv2/modeling_convnextv2.py b/src/transformers/models/convnextv2/modeling_convnextv2.py index 5fbf7831ef12..c309cdc3b6e2 100644 --- a/src/transformers/models/convnextv2/modeling_convnextv2.py +++ b/src/transformers/models/convnextv2/modeling_convnextv2.py @@ -100,14 +100,14 @@ class ConvNextV2GRN(nn.Module): def __init__(self, dim: int): super().__init__() - self.gamma = nn.Parameter(torch.zeros(1, 1, 1, dim)) - self.beta = nn.Parameter(torch.zeros(1, 1, 1, dim)) + self.weight = nn.Parameter(torch.zeros(1, 1, 1, dim)) + self.bias = nn.Parameter(torch.zeros(1, 1, 1, dim)) def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor: # Compute and normalize global spatial feature maps global_features = torch.norm(hidden_states, p=2, dim=(1, 2), keepdim=True) norm_features = global_features / (global_features.mean(dim=-1, keepdim=True) + 1e-6) - hidden_states = self.gamma * (hidden_states * norm_features) + self.beta + hidden_states + hidden_states = self.weight * (hidden_states * norm_features) + self.bias + hidden_states return hidden_states From ee4bc07474015d574742ad25b48dd7f6ccba297f Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Wed, 3 May 2023 22:49:54 +0800 Subject: [PATCH 012/935] Support union types `X | Y` syntax for `HfArgumentParser` for Python 3.10+ (#23126) * Support union types `X | Y` syntax for `HfArgumentParser` for Python 3.10+ * Add tests for PEP 604 for `HfArgumentParser` * Reorganize tests --- src/transformers/hf_argparser.py | 18 +++++++- tests/utils/test_hf_argparser.py | 73 
+++++++++++++++++++++++--------- 2 files changed, 69 insertions(+), 22 deletions(-) diff --git a/src/transformers/hf_argparser.py b/src/transformers/hf_argparser.py index b1fa67f45823..f808acebe902 100644 --- a/src/transformers/hf_argparser.py +++ b/src/transformers/hf_argparser.py @@ -15,6 +15,7 @@ import dataclasses import json import sys +import types from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser, ArgumentTypeError from copy import copy from enum import Enum @@ -159,7 +160,7 @@ def _parse_dataclass_field(parser: ArgumentParser, field: dataclasses.Field): aliases = [aliases] origin_type = getattr(field.type, "__origin__", field.type) - if origin_type is Union: + if origin_type is Union or (hasattr(types, "UnionType") and isinstance(origin_type, types.UnionType)): if str not in field.type.__args__ and ( len(field.type.__args__) != 2 or type(None) not in field.type.__args__ ): @@ -245,10 +246,23 @@ def _add_dataclass_arguments(self, dtype: DataClassType): type_hints: Dict[str, type] = get_type_hints(dtype) except NameError: raise RuntimeError( - f"Type resolution failed for f{dtype}. Try declaring the class in global scope or " + f"Type resolution failed for {dtype}. Try declaring the class in global scope or " "removing line of `from __future__ import annotations` which opts in Postponed " "Evaluation of Annotations (PEP 563)" ) + except TypeError as ex: + # Remove this block when we drop Python 3.9 support + if sys.version_info[:2] < (3, 10) and "unsupported operand type(s) for |" in str(ex): + python_version = ".".join(map(str, sys.version_info[:3])) + raise RuntimeError( + f"Type resolution failed for {dtype} on Python {python_version}. Try removing " + "line of `from __future__ import annotations` which opts in union types as " + "`X | Y` (PEP 604) via Postponed Evaluation of Annotations (PEP 563). To " + "support Python versions that lower than 3.10, you need to use " + "`typing.Union[X, Y]` instead of `X | Y` and `typing.Optional[X]` instead of " + "`X | None`." 
+ ) from ex + raise for field in dataclasses.fields(dtype): if not field.init: diff --git a/tests/utils/test_hf_argparser.py b/tests/utils/test_hf_argparser.py index 0ad3c9c2ac46..a9db072f04d1 100644 --- a/tests/utils/test_hf_argparser.py +++ b/tests/utils/test_hf_argparser.py @@ -15,6 +15,7 @@ import argparse import json import os +import sys import tempfile import unittest from argparse import Namespace @@ -36,6 +37,10 @@ # For Python 3.7 from typing_extensions import Literal +# Since Python 3.10, we can use the builtin `|` operator for Union types +# See PEP 604: https://peps.python.org/pep-0604 +is_python_no_less_than_3_10 = sys.version_info >= (3, 10) + def list_field(default=None, metadata=None): return field(default_factory=lambda: default, metadata=metadata) @@ -125,6 +130,23 @@ class StringLiteralAnnotationExample: foo_str: "List[str]" = list_field(default=["Hallo", "Bonjour", "Hello"]) +if is_python_no_less_than_3_10: + + @dataclass + class WithDefaultBoolExamplePep604: + foo: bool = False + baz: bool = True + opt: bool | None = None + + @dataclass + class OptionalExamplePep604: + foo: int | None = None + bar: float | None = field(default=None, metadata={"help": "help message"}) + baz: str | None = None + ces: list[str] | None = list_field(default=[]) + des: list[int] | None = list_field(default=[]) + + class HfArgumentParserTest(unittest.TestCase): def argparsersEqual(self, a: argparse.ArgumentParser, b: argparse.ArgumentParser): """ @@ -167,8 +189,6 @@ def test_with_default(self): self.argparsersEqual(parser, expected) def test_with_default_bool(self): - parser = HfArgumentParser(WithDefaultBoolExample) - expected = argparse.ArgumentParser() expected.add_argument("--foo", type=string_to_bool, default=False, const=True, nargs="?") expected.add_argument("--baz", type=string_to_bool, default=True, const=True, nargs="?") @@ -176,22 +196,29 @@ def test_with_default_bool(self): # and its default must be set to False expected.add_argument("--no_baz", action="store_false", default=False, dest="baz") expected.add_argument("--opt", type=string_to_bool, default=None) - self.argparsersEqual(parser, expected) - args = parser.parse_args([]) - self.assertEqual(args, Namespace(foo=False, baz=True, opt=None)) + dataclass_types = [WithDefaultBoolExample] + if is_python_no_less_than_3_10: + dataclass_types.append(WithDefaultBoolExamplePep604) - args = parser.parse_args(["--foo", "--no_baz"]) - self.assertEqual(args, Namespace(foo=True, baz=False, opt=None)) + for dataclass_type in dataclass_types: + parser = HfArgumentParser(dataclass_type) + self.argparsersEqual(parser, expected) - args = parser.parse_args(["--foo", "--baz"]) - self.assertEqual(args, Namespace(foo=True, baz=True, opt=None)) + args = parser.parse_args([]) + self.assertEqual(args, Namespace(foo=False, baz=True, opt=None)) - args = parser.parse_args(["--foo", "True", "--baz", "True", "--opt", "True"]) - self.assertEqual(args, Namespace(foo=True, baz=True, opt=True)) + args = parser.parse_args(["--foo", "--no_baz"]) + self.assertEqual(args, Namespace(foo=True, baz=False, opt=None)) - args = parser.parse_args(["--foo", "False", "--baz", "False", "--opt", "False"]) - self.assertEqual(args, Namespace(foo=False, baz=False, opt=False)) + args = parser.parse_args(["--foo", "--baz"]) + self.assertEqual(args, Namespace(foo=True, baz=True, opt=None)) + + args = parser.parse_args(["--foo", "True", "--baz", "True", "--opt", "True"]) + self.assertEqual(args, Namespace(foo=True, baz=True, opt=True)) + + args = parser.parse_args(["--foo", 
"False", "--baz", "False", "--opt", "False"]) + self.assertEqual(args, Namespace(foo=False, baz=False, opt=False)) def test_with_enum(self): parser = HfArgumentParser(MixedTypeEnumExample) @@ -266,21 +293,27 @@ def test_with_list(self): self.assertEqual(args, Namespace(foo_int=[1], bar_int=[2, 3], foo_str=["a", "b", "c"], foo_float=[0.1, 0.7])) def test_with_optional(self): - parser = HfArgumentParser(OptionalExample) - expected = argparse.ArgumentParser() expected.add_argument("--foo", default=None, type=int) expected.add_argument("--bar", default=None, type=float, help="help message") expected.add_argument("--baz", default=None, type=str) expected.add_argument("--ces", nargs="+", default=[], type=str) expected.add_argument("--des", nargs="+", default=[], type=int) - self.argparsersEqual(parser, expected) - args = parser.parse_args([]) - self.assertEqual(args, Namespace(foo=None, bar=None, baz=None, ces=[], des=[])) + dataclass_types = [OptionalExample] + if is_python_no_less_than_3_10: + dataclass_types.append(OptionalExamplePep604) + + for dataclass_type in dataclass_types: + parser = HfArgumentParser(dataclass_type) + + self.argparsersEqual(parser, expected) + + args = parser.parse_args([]) + self.assertEqual(args, Namespace(foo=None, bar=None, baz=None, ces=[], des=[])) - args = parser.parse_args("--foo 12 --bar 3.14 --baz 42 --ces a b c --des 1 2 3".split()) - self.assertEqual(args, Namespace(foo=12, bar=3.14, baz="42", ces=["a", "b", "c"], des=[1, 2, 3])) + args = parser.parse_args("--foo 12 --bar 3.14 --baz 42 --ces a b c --des 1 2 3".split()) + self.assertEqual(args, Namespace(foo=12, bar=3.14, baz="42", ces=["a", "b", "c"], des=[1, 2, 3])) def test_with_required(self): parser = HfArgumentParser(RequiredExample) From c4e32e206fe8eff7ccd9ca64ed061b54e7597193 Mon Sep 17 00:00:00 2001 From: Mayank Agarwal Date: Wed, 3 May 2023 20:20:34 +0530 Subject: [PATCH 013/935] Add support for beam search's num_return_sequencs flag in flax (#23082) * add code for numReturnSeq * add flax support for num return sequences * Make Fix up for changes * add test for num return sequences * lint --- src/transformers/generation/flax_utils.py | 11 ++++++++--- tests/generation/test_flax_utils.py | 13 +++++++++++++ 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/src/transformers/generation/flax_utils.py b/src/transformers/generation/flax_utils.py index c28e0afbb998..58a2bf13ba61 100644 --- a/src/transformers/generation/flax_utils.py +++ b/src/transformers/generation/flax_utils.py @@ -463,6 +463,7 @@ def generate( logits_processor=logits_processor, trace=trace, params=params, + num_return_sequences=generation_config.num_return_sequences, model_kwargs=model_kwargs, ) else: @@ -749,6 +750,7 @@ def _beam_search( logits_processor: Optional[FlaxLogitsProcessorList] = None, trace: bool = True, params: Optional[Dict[str, jnp.ndarray]] = None, + num_return_sequences: Optional[int] = None, model_kwargs: Optional[Dict[str, jnp.ndarray]] = None, ): """ @@ -793,6 +795,9 @@ def gather_fn(tensor): eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id length_penalty = length_penalty if length_penalty is not None else self.generation_config.length_penalty early_stopping = early_stopping if early_stopping is not None else self.generation_config.early_stopping + num_return_sequences = ( + num_return_sequences if num_return_sequences is not None else self.generation_config.num_return_sequences + ) batch_size, num_beams, cur_len = input_ids.shape @@ -996,8 +1001,8 @@ def 
beam_search_body_fn(state, input_ids_length=1): sequences = jnp.where(none_finished[:, None, None], state.sequences, state.running_sequences) scores = jnp.where(none_finished[:, None], state.scores, state.running_scores) - # take best beam for each batch - sequences = sequences[:, 0] - scores = scores[:, 0] + # Take best beams for each batch (the score is sorted in descending order) + sequences = flatten_beam_dim(sequences[:, :num_return_sequences, :]) + scores = flatten_beam_dim(scores[:, :num_return_sequences]) return FlaxBeamSearchOutput(sequences=sequences, scores=scores) diff --git a/tests/generation/test_flax_utils.py b/tests/generation/test_flax_utils.py index c6182a2386e5..647482b88cd8 100644 --- a/tests/generation/test_flax_utils.py +++ b/tests/generation/test_flax_utils.py @@ -158,6 +158,19 @@ def test_beam_search_generate(self): self.assertListEqual(generation_outputs.tolist(), jit_generation_outputs.tolist()) + def test_beam_search_generate_num_return_sequences(self): + config, input_ids, _, max_length = self._get_input_ids_and_config() + config.do_sample = False + config.max_length = max_length + config.num_beams = 2 + config.num_return_sequences = 2 + + for model_class in self.all_generative_model_classes: + model = model_class(config) + + generation_outputs = model.generate(input_ids).sequences + self.assertEqual(generation_outputs.shape[0], input_ids.shape[0] * config.num_return_sequences) + def test_sample_generate_logits_warper(self): config, input_ids, _, max_length = self._get_input_ids_and_config() config.do_sample = True From fbe0178f08c219313986092f4c9b994a7bd4b4a1 Mon Sep 17 00:00:00 2001 From: Nayeon Han Date: Thu, 4 May 2023 00:04:58 +0900 Subject: [PATCH 014/935] docs: ko: update `_toctree.yml` (#23112) * docs: ko: update `_toctree.yml` * fix: ko: update toc * fix: resolve suggestions * fix: resolve build issue --------- Co-authored-by: Wonhyeong Seo --- docs/source/ko/_toctree.yml | 146 +++++++++++++++++++---------------- docs/source/ko/notebooks.mdx | 1 - 2 files changed, 79 insertions(+), 68 deletions(-) delete mode 100644 docs/source/ko/notebooks.mdx diff --git a/docs/source/ko/_toctree.yml b/docs/source/ko/_toctree.yml index a9e1ff921d6b..bd24ee4e5ce1 100644 --- a/docs/source/ko/_toctree.yml +++ b/docs/source/ko/_toctree.yml @@ -8,45 +8,22 @@ title: 시작하기 - sections: - local: pipeline_tutorial - title: 추론을 위한 Pipeline + title: Pipeline으로 추론하기 - local: autoclass_tutorial - title: 자동 클래스로 사전 학습된 인스턴스 로드하기 + title: AutoClass로 사전 학습된 인스턴스 로드하기 - local: preprocessing - title: 전처리 + title: 데이터 전처리하기 - local: training title: 사전 학습된 모델 미세 조정하기 + - local: run_scripts + title: 스크립트로 학습하기 - local: accelerate - title: 🤗 Accelerate를 활용한 분산 학습 + title: 🤗 Accelerate로 분산 학습 구성하기 - local: model_sharing - title: 모델 공유하기 - title: (번역중) 튜토리얼 + title: 만든 모델 공유하기 + title: 튜토리얼 - sections: - sections: - - local: create_a_model - title: 맞춤형 아키텍처 만들기 - - local: custom_models - title: 사용자 정의 모델 공유하기 - - local: run_scripts - title: 스크립트로 학습하기 - - local: sagemaker - title: Amazon SageMaker에서 학습 실행하기 - - local: in_translation - title: (번역중) Converting from TensorFlow checkpoints - - local: serialization - title: ONNX로 내보내기 - - local: torchscript - title: TorchScript로 내보내기 - - local: in_translation - title: (번역중) Troubleshoot - title: (번역중) 일반적인 사용방법 - - sections: - - local: in_translation - title: (번역중) Use tokenizers from 🤗 Tokenizers - - local: multilingual - title: 다국어 모델 추론하기 - - local: in_translation - title: (번역중) Text generation strategies - - sections: - local: 
tasks/sequence_classification title: 텍스트 분류 - local: tasks/token_classification @@ -62,39 +39,68 @@ - local: tasks/summarization title: 요약 - local: in_translation - title: (번역중) Multiple choice - title: (번역중) 태스크별 가이드 - isExpanded: false - title: (번역중) 자연어처리 + title: (번역중) Multiple Choice + title: 자연어처리 + isExpanded: false - sections: - - local: in_translation - title: (번역중) Audio classification - - local: in_translation - title: (번역중) Automatic speech recognition + - local: in_translation + title: (번역중) Audio classification + - local: in_translation + title: (번역중) Automatic speech recognition title: (번역중) 오디오 + isExpanded: false - sections: - - local: tasks/image_classification - title: 이미지 분류 + - local: tasks/image_classification + title: 이미지 분류 + - local: in_translation + title: (번역중) Semantic segmentation + - local: in_translation + title: (번역중) Video classification + - local: in_translation + title: (번역중) Object detection + - local: in_translation + title: (번역중) Zero-shot object detection + - local: tasks/zero_shot_image_classification + title: 제로샷(zero-shot) 이미지 분류 + - local: in_translation + title: (번역중) Depth estimation + title: (번역중) 컴퓨터 비전 + isExpanded: false + - sections: + - local: tasks/image_captioning + title: 이미지 캡셔닝 + - local: in_translation + title: (번역중) Document Question Answering + title: (번역중) 멀티모달 + isExpanded: false + title: 태스크 가이드 +- sections: - local: in_translation - title: (번역중) Semantic segmentation + title: (번역중) Use fast tokenizers from 🤗 Tokenizers + - local: multilingual + title: 다국어 모델 추론하기 - local: in_translation - title: (번역중) Video classification + title: (번역중) Customize text generation strategy + - local: create_a_model + title: 모델별 API 사용하기 + - local: custom_models + title: 사용자 정의 모델 공유하기 + - local: sagemaker + title: Amazon SageMaker에서 학습 실행하기 + - local: serialization + title: ONNX로 내보내기 + - local: torchscript + title: TorchScript로 내보내기 - local: in_translation - title: (번역중) Object detection + title: (번역중) Benchmarks - local: in_translation - title: (번역중) Zero-shot object detection - - local: tasks/zero_shot_image_classification - title: 제로샷(zero-shot) 이미지 분류 + title: (번역중) Notebooks with examples - local: in_translation - title: (번역중) Depth estimation - title: (번역중) 컴퓨터 비전 - - sections: - - local: tasks/image_captioning - title: 이미지 캡셔닝 + title: (번역중) Community resources - local: in_translation - title: (번역중) Document Question Answering - title: (번역중) 멀티모달 - - sections: + title: (번역중) Troubleshoot + title: (번역중) 개발자 가이드 +- sections: - local: in_translation title: (번역중) Overview - local: in_translation @@ -129,8 +135,8 @@ title: (번역중) Hyperparameter Search using Trainer API - local: in_translation title: (번역중) XLA Integration for TensorFlow Models - title: (번역중) 성능 및 확장성 - - sections: + title: (번역중) 성능 및 확장성 +- sections: - local: in_translation title: (번역중) How to contribute to transformers? 
- local: in_translation @@ -143,16 +149,8 @@ title: (번역중) Testing - local: in_translation title: (번역중) Checks on a Pull Request - title: (번역중) 기여하기 - - local: notebooks - title: (번역중) 🤗 Transformers Notebooks - - local: in_translation - title: (번역중) Community resources - - local: in_translation - title: (번역중) Benchmarks - - local: in_translation - title: (번역중) Migrating from previous packages - title: (번역중) How-to 가이드 + title: (번역중) 기여하기 + - sections: - local: in_translation title: (번역중) Philosophy @@ -263,6 +261,8 @@ title: (번역중) ConvBERT - local: in_translation title: (번역중) CPM + - local: in_translation + title: (번역중) CPMANT - local: in_translation title: (번역중) CTRL - local: in_translation @@ -309,6 +309,8 @@ title: (번역중) GPT-J - local: in_translation title: (번역중) GPT2 + - local: in_translation + title: (번역중) GPTBigCode - local: in_translation title: (번역중) GPTSAN Japanese - local: in_translation @@ -361,6 +363,8 @@ title: (번역중) NLLB-MoE - local: in_translation title: (번역중) Nyströmformer + - local: in_translation + title: (번역중) Open-Llama - local: in_translation title: (번역중) OPT - local: in_translation @@ -460,6 +464,8 @@ title: (번역중) EfficientFormer - local: in_translation title: (번역중) EfficientNet + - local: in_translation + title: (번역중) FocalNet - local: in_translation title: (번역중) GLPN - local: in_translation @@ -572,6 +578,8 @@ title: (번역중) CLIPSeg - local: in_translation title: (번역중) Data2Vec + - local: in_translation + title: (번역중) DePlot - local: in_translation title: (번역중) Donut - local: in_translation @@ -592,6 +600,8 @@ title: (번역중) LiLT - local: in_translation title: (번역중) LXMERT + - local: in_translation + title: (번역중) MatCha - local: in_translation title: (번역중) MGP-STR - local: in_translation @@ -602,6 +612,8 @@ title: (번역중) Perceiver - local: in_translation title: (번역중) Pix2Struct + - local: in_translation + title: (번역중) Segment Anything - local: in_translation title: (번역중) Speech Encoder Decoder Models - local: in_translation diff --git a/docs/source/ko/notebooks.mdx b/docs/source/ko/notebooks.mdx deleted file mode 100644 index ead906183348..000000000000 --- a/docs/source/ko/notebooks.mdx +++ /dev/null @@ -1 +0,0 @@ -# 열심히 번역 중입니다. 조금 이따 만나요! \ No newline at end of file From ca7eb27ed590dd583bc028ba1a0f78eb00dbb243 Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Wed, 3 May 2023 18:23:09 +0200 Subject: [PATCH 015/935] =?UTF-8?q?[doc]=20Try=20a=20few=20=E2=89=A0=20way?= =?UTF-8?q?s=20of=20linking=20to=20Papers,=20users,=20and=20org=20profiles?= =?UTF-8?q?=20(#22611)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * [doc] Try a few ≠ ways of linking to Papers, users, and org profiles * Empty commit * Empty commit now that the backend is fixed --------- Co-authored-by: Lysandre --- docs/source/en/model_doc/distilbert.mdx | 5 ++++- docs/source/en/model_doc/gpt2.mdx | 2 +- docs/source/en/model_doc/roberta.mdx | 5 ++++- docs/source/en/model_doc/t5.mdx | 7 +++++-- 4 files changed, 14 insertions(+), 5 deletions(-) diff --git a/docs/source/en/model_doc/distilbert.mdx b/docs/source/en/model_doc/distilbert.mdx index cc1e03715118..837f0319ec9e 100644 --- a/docs/source/en/model_doc/distilbert.mdx +++ b/docs/source/en/model_doc/distilbert.mdx @@ -19,13 +19,16 @@ specific language governing permissions and limitations under the License. 
Spaces + +Paper page + ## Overview The DistilBERT model was proposed in the blog post [Smaller, faster, cheaper, lighter: Introducing DistilBERT, a distilled version of BERT](https://medium.com/huggingface/distilbert-8cf3380435b5), and the paper [DistilBERT, a -distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108). DistilBERT is a +distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/papers/1910.01108). DistilBERT is a small, fast, cheap and light Transformer model trained by distilling BERT base. It has 40% less parameters than *bert-base-uncased*, runs 60% faster while preserving over 95% of BERT's performances as measured on the GLUE language understanding benchmark. diff --git a/docs/source/en/model_doc/gpt2.mdx b/docs/source/en/model_doc/gpt2.mdx index ee80eb2f8b9c..6288e46eb555 100644 --- a/docs/source/en/model_doc/gpt2.mdx +++ b/docs/source/en/model_doc/gpt2.mdx @@ -24,7 +24,7 @@ specific language governing permissions and limitations under the License. ## Overview OpenAI GPT-2 model was proposed in [Language Models are Unsupervised Multitask Learners](https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf) by Alec -Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei and Ilya Sutskever. It's a causal (unidirectional) +Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei and Ilya Sutskever from [OpenAI](https://huggingface.co/openai). It's a causal (unidirectional) transformer pretrained using language modeling on a very large corpus of ~40 GB of text data. The abstract from the paper is the following: diff --git a/docs/source/en/model_doc/roberta.mdx b/docs/source/en/model_doc/roberta.mdx index 7c0818a0144d..49007409ad39 100644 --- a/docs/source/en/model_doc/roberta.mdx +++ b/docs/source/en/model_doc/roberta.mdx @@ -19,11 +19,14 @@ specific language governing permissions and limitations under the License. Spaces + +Paper page + ## Overview -The RoBERTa model was proposed in [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer +The RoBERTa model was proposed in [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, [Myle Ott](https://huggingface.co/myleott), Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. It is based on Google's BERT model released in 2018. It builds on BERT and modifies key hyperparameters, removing the next-sentence pretraining objective and training with diff --git a/docs/source/en/model_doc/t5.mdx b/docs/source/en/model_doc/t5.mdx index f7665c11ae4a..58074f1403b1 100644 --- a/docs/source/en/model_doc/t5.mdx +++ b/docs/source/en/model_doc/t5.mdx @@ -19,12 +19,15 @@ specific language governing permissions and limitations under the License. Spaces + +Paper page + ## Overview -The T5 model was presented in [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/pdf/1910.10683.pdf) by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, -Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu. 
+The T5 model was presented in [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/pdf/1910.10683.pdf) by [Colin Raffel](https://huggingface.co/craffel), Noam Shazeer, [Adam Roberts](https://huggingface.co/adarob), Katherine Lee, Sharan Narang, +Michael Matena, Yanqi Zhou, Wei Li, [Peter J. Liu](https://huggingface.co/peterjliu). The abstract from the paper is the following: From 441658dd6c7a788400d566aaeefe74990c9d040f Mon Sep 17 00:00:00 2001 From: Alara Dirik <8944735+alaradirik@users.noreply.github.com> Date: Wed, 3 May 2023 19:32:42 +0300 Subject: [PATCH 016/935] Add focalnet backbone (#23104) Adds FocalNet backbone to return features from all stages --- src/transformers/__init__.py | 2 + src/transformers/models/auto/modeling_auto.py | 1 + src/transformers/models/focalnet/__init__.py | 2 + .../models/focalnet/configuration_focalnet.py | 47 +++++++++++ .../focalnet/convert_focalnet_to_hf_format.py | 6 +- .../models/focalnet/modeling_focalnet.py | 82 ++++++++++++++++++- src/transformers/utils/dummy_pt_objects.py | 7 ++ .../models/focalnet/test_modeling_focalnet.py | 71 ++++++++++++++-- tests/test_backbone_common.py | 2 + utils/check_repo.py | 1 + 10 files changed, 210 insertions(+), 11 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 445fbb53e269..46be8c9d2c6f 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1623,6 +1623,7 @@ _import_structure["models.focalnet"].extend( [ "FOCALNET_PRETRAINED_MODEL_ARCHIVE_LIST", + "FocalNetBackbone", "FocalNetForImageClassification", "FocalNetForMaskedImageModeling", "FocalNetModel", @@ -5178,6 +5179,7 @@ ) from .models.focalnet import ( FOCALNET_PRETRAINED_MODEL_ARCHIVE_LIST, + FocalNetBackbone, FocalNetForImageClassification, FocalNetForMaskedImageModeling, FocalNetModel, diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index ccdda1af33f0..14847c7ad2ab 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -980,6 +980,7 @@ ("convnext", "ConvNextBackbone"), ("convnextv2", "ConvNextV2Backbone"), ("dinat", "DinatBackbone"), + ("focalnet", "FocalNetBackbone"), ("maskformer-swin", "MaskFormerSwinBackbone"), ("nat", "NatBackbone"), ("resnet", "ResNetBackbone"), diff --git a/src/transformers/models/focalnet/__init__.py b/src/transformers/models/focalnet/__init__.py index e082ae26d2d1..b043a006f937 100644 --- a/src/transformers/models/focalnet/__init__.py +++ b/src/transformers/models/focalnet/__init__.py @@ -30,6 +30,7 @@ "FOCALNET_PRETRAINED_MODEL_ARCHIVE_LIST", "FocalNetForImageClassification", "FocalNetForMaskedImageModeling", + "FocalNetBackbone", "FocalNetModel", "FocalNetPreTrainedModel", ] @@ -45,6 +46,7 @@ else: from .modeling_focalnet import ( FOCALNET_PRETRAINED_MODEL_ARCHIVE_LIST, + FocalNetBackbone, FocalNetForImageClassification, FocalNetForMaskedImageModeling, FocalNetModel, diff --git a/src/transformers/models/focalnet/configuration_focalnet.py b/src/transformers/models/focalnet/configuration_focalnet.py index 5bfecb5737a4..c6814e1dda14 100644 --- a/src/transformers/models/focalnet/configuration_focalnet.py +++ b/src/transformers/models/focalnet/configuration_focalnet.py @@ -47,6 +47,8 @@ class FocalNetConfig(PretrainedConfig): use_conv_embed (`bool`, *optional*, defaults to `False`): Whether to use convolutional embedding. 
The authors noted that using convolutional embedding usually improve the performance, but it's not used by default. + hidden_sizes (`List[int]`, *optional*, defaults to `[192, 384, 768, 768]`): + Dimensionality (hidden size) at each stage. depths (`list(int)`, *optional*, defaults to `[2, 2, 6, 2]`): Depth (number of layers) of each stage in the encoder. focal_levels (`list(int)`, *optional*, defaults to `[2, 2, 2, 2]`): @@ -78,6 +80,14 @@ class FocalNetConfig(PretrainedConfig): The epsilon used by the layer normalization layers. encoder_stride (`int`, `optional`, defaults to 32): Factor to increase the spatial resolution by in the decoder head for masked image modeling. + out_features (`List[str]`, *optional*): + If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc. + (depending on how many stages the model has). If unset and `out_indices` is set, will default to the + corresponding stages. If unset and `out_indices` is unset, will default to the last stage. + out_indices (`List[int]`, *optional*): + If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how + many stages the model has). If unset and `out_features` is set, will default to the corresponding stages. + If unset and `out_features` is unset, will default to the last stage. Example: @@ -102,6 +112,7 @@ def __init__( num_channels=3, embed_dim=96, use_conv_embed=False, + hidden_sizes=[192, 384, 768, 768], depths=[2, 2, 6, 2], focal_levels=[2, 2, 2, 2], focal_windows=[3, 3, 3, 3], @@ -117,6 +128,8 @@ def __init__( initializer_range=0.02, layer_norm_eps=1e-5, encoder_stride=32, + out_features=None, + out_indices=None, **kwargs, ): super().__init__(**kwargs) @@ -126,6 +139,7 @@ def __init__( self.num_channels = num_channels self.embed_dim = embed_dim self.use_conv_embed = use_conv_embed + self.hidden_sizes = hidden_sizes self.depths = depths self.focal_levels = focal_levels self.focal_windows = focal_windows @@ -141,3 +155,36 @@ def __init__( self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps self.encoder_stride = encoder_stride + self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(self.depths) + 1)] + + if out_features is not None and out_indices is not None: + if len(out_features) != len(out_indices): + raise ValueError("out_features and out_indices should have the same length if both are set") + elif out_features != [self.stage_names[idx] for idx in out_indices]: + raise ValueError("out_features and out_indices should correspond to the same stages if both are set") + + if out_features is None and out_indices is not None: + out_features = [self.stage_names[idx] for idx in out_indices] + elif out_features is not None and out_indices is None: + out_indices = [self.stage_names.index(feature) for feature in out_features] + elif out_features is None and out_indices is None: + out_features = [self.stage_names[-1]] + out_indices = [len(self.stage_names) - 1] + + if out_features is not None: + if not isinstance(out_features, list): + raise ValueError("out_features should be a list") + for feature in out_features: + if feature not in self.stage_names: + raise ValueError( + f"Feature {feature} is not a valid feature name. 
Valid names are {self.stage_names}" + ) + if out_indices is not None: + if not isinstance(out_indices, (list, tuple)): + raise ValueError("out_indices should be a list or tuple") + for idx in out_indices: + if idx >= len(self.stage_names): + raise ValueError(f"Index {idx} is not a valid index for a list of length {len(self.stage_names)}") + + self.out_features = out_features + self.out_indices = out_indices diff --git a/src/transformers/models/focalnet/convert_focalnet_to_hf_format.py b/src/transformers/models/focalnet/convert_focalnet_to_hf_format.py index a23383e3abc9..4aed15928062 100644 --- a/src/transformers/models/focalnet/convert_focalnet_to_hf_format.py +++ b/src/transformers/models/focalnet/convert_focalnet_to_hf_format.py @@ -56,7 +56,6 @@ def get_focalnet_config(model_name): embed_dim = 128 elif "large" in model_name: embed_dim = 192 - focal_windows = [5, 5, 5, 5] elif "xlarge" in model_name: embed_dim = 256 elif "huge" in model_name: @@ -130,7 +129,10 @@ def convert_focalnet_checkpoint(model_name, pytorch_dump_folder_path, push_to_hu "focalnet-small-lrf": "https://projects4jw.blob.core.windows.net/focalnet/release/classification/focalnet_small_lrf.pth", "focalnet-base": "https://projects4jw.blob.core.windows.net/focalnet/release/classification/focalnet_base_srf.pth", "focalnet-base-lrf": "https://projects4jw.blob.core.windows.net/focalnet/release/classification/focalnet_base_lrf.pth", - "focalnet-large": "https://projects4jw.blob.core.windows.net/focalnet/release/classification/focalnet_large_lrf_384.pth", + "focalnet-large-lrf-fl3": "https://projects4jw.blob.core.windows.net/focalnet/release/classification/focalnet_large_lrf_384.pth", + "focalnet-large-lrf-fl4": "https://projects4jw.blob.core.windows.net/focalnet/release/classification/focalnet_large_lrf_384_fl4.pth", + "focalnet-xlarge-lrf-fl3": "https://projects4jw.blob.core.windows.net/focalnet/release/classification/focalnet_xlarge_lrf_384.pth", + "focalnet-xlarge-lrf-fl4": "https://projects4jw.blob.core.windows.net/focalnet/release/classification/focalnet_xlarge_lrf_384_fl4.pth", } # fmt: on diff --git a/src/transformers/models/focalnet/modeling_focalnet.py b/src/transformers/models/focalnet/modeling_focalnet.py index ff1b75e14b5e..cfd64689763b 100644 --- a/src/transformers/models/focalnet/modeling_focalnet.py +++ b/src/transformers/models/focalnet/modeling_focalnet.py @@ -26,7 +26,8 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN -from ...modeling_utils import PreTrainedModel +from ...modeling_outputs import BackboneOutput +from ...modeling_utils import BackboneMixin, PreTrainedModel from ...utils import ( ModelOutput, add_code_sample_docstrings, @@ -209,7 +210,6 @@ def forward( embeddings = embeddings * (1.0 - mask) + mask_tokens * mask embeddings = self.dropout(embeddings) - return embeddings, output_dimensions @@ -971,3 +971,81 @@ def forward( hidden_states=outputs.hidden_states, reshaped_hidden_states=outputs.reshaped_hidden_states, ) + + +@add_start_docstrings( + """ + FocalNet backbone, to be used with frameworks like X-Decoder. 
+ """, + FOCALNET_START_DOCSTRING, +) +class FocalNetBackbone(FocalNetPreTrainedModel, BackboneMixin): + def __init__(self, config): + super().__init__(config) + + self.stage_names = config.stage_names + self.focalnet = FocalNetModel(config) + + self.num_features = [config.embed_dim] + config.hidden_sizes + self.out_features = config.out_features if config.out_features is not None else [self.stage_names[-1]] + if config.out_indices is not None: + self.out_indices = config.out_indices + else: + self.out_indices = tuple(i for i, layer in enumerate(self.stage_names) if layer in self.out_features) + + # initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(FOCALNET_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values: torch.Tensor, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> BackboneOutput: + """ + Returns: + + Examples: + + ```python + >>> from transformers import AutoImageProcessor, AutoBackbone + >>> import torch + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> processor = AutoImageProcessor.from_pretrained("microsoft/focalnet-tiny-lrf") + >>> model = AutoBackbone.from_pretrained("microsoft/focalnet-tiny-lrf") + + >>> inputs = processor(image, return_tensors="pt") + >>> outputs = model(**inputs) + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + outputs = self.focalnet(pixel_values, output_hidden_states=True, return_dict=True) + + hidden_states = outputs.reshaped_hidden_states + + feature_maps = () + for idx, stage in enumerate(self.stage_names): + if stage in self.out_features: + feature_maps += (hidden_states[idx],) + + if not return_dict: + output = (feature_maps,) + if output_hidden_states: + output += (outputs.hidden_states,) + return output + + return BackboneOutput( + feature_maps=feature_maps, + hidden_states=outputs.hidden_states if output_hidden_states else None, + attentions=None, + ) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 7fe538eccc9c..1e9845ba9bcc 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -3002,6 +3002,13 @@ def __init__(self, *args, **kwargs): FOCALNET_PRETRAINED_MODEL_ARCHIVE_LIST = None +class FocalNetBackbone(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class FocalNetForImageClassification(metaclass=DummyObject): _backends = ["torch"] diff --git a/tests/models/focalnet/test_modeling_focalnet.py b/tests/models/focalnet/test_modeling_focalnet.py index 4ddf8e63b944..75127e5fd382 100644 --- a/tests/models/focalnet/test_modeling_focalnet.py +++ b/tests/models/focalnet/test_modeling_focalnet.py @@ -22,6 +22,7 @@ from transformers.testing_utils import require_torch, require_vision, slow, torch_device from transformers.utils import cached_property, is_torch_available, is_vision_available +from ...test_backbone_common import BackboneTesterMixin from ...test_configuration_common import ConfigTester from ...test_modeling_common import ModelTesterMixin, 
_config_zero_init, floats_tensor, ids_tensor @@ -30,7 +31,12 @@ import torch from torch import nn - from transformers import FocalNetForImageClassification, FocalNetForMaskedImageModeling, FocalNetModel + from transformers import ( + FocalNetBackbone, + FocalNetForImageClassification, + FocalNetForMaskedImageModeling, + FocalNetModel, + ) from transformers.models.focalnet.modeling_focalnet import FOCALNET_PRETRAINED_MODEL_ARCHIVE_LIST if is_vision_available(): @@ -48,6 +54,7 @@ def __init__( patch_size=2, num_channels=3, embed_dim=16, + hidden_sizes=[32, 64, 128], depths=[1, 2, 1], num_heads=[2, 2, 4], window_size=2, @@ -67,6 +74,7 @@ def __init__( type_sequence_label_size=10, encoder_stride=8, out_features=["stage1", "stage2"], + out_indices=[1, 2], ): self.parent = parent self.batch_size = batch_size @@ -74,6 +82,7 @@ def __init__( self.patch_size = patch_size self.num_channels = num_channels self.embed_dim = embed_dim + self.hidden_sizes = hidden_sizes self.depths = depths self.num_heads = num_heads self.window_size = window_size @@ -93,6 +102,7 @@ def __init__( self.type_sequence_label_size = type_sequence_label_size self.encoder_stride = encoder_stride self.out_features = out_features + self.out_indices = out_indices def prepare_config_and_inputs(self): pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) @@ -111,6 +121,7 @@ def get_config(self): patch_size=self.patch_size, num_channels=self.num_channels, embed_dim=self.embed_dim, + hidden_sizes=self.hidden_sizes, depths=self.depths, num_heads=self.num_heads, window_size=self.window_size, @@ -126,6 +137,7 @@ def get_config(self): initializer_range=self.initializer_range, encoder_stride=self.encoder_stride, out_features=self.out_features, + out_indices=self.out_indices, ) def create_and_check_model(self, config, pixel_values, labels): @@ -139,6 +151,35 @@ def create_and_check_model(self, config, pixel_values, labels): self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, expected_seq_len, expected_dim)) + def create_and_check_backbone(self, config, pixel_values, labels): + model = FocalNetBackbone(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + + # verify feature maps + self.parent.assertEqual(len(result.feature_maps), len(config.out_features)) + self.parent.assertListEqual(list(result.feature_maps[0].shape), [self.batch_size, self.image_size, 8, 8]) + + # verify channels + self.parent.assertEqual(len(model.channels), len(config.out_features)) + self.parent.assertListEqual(model.channels, config.hidden_sizes[:-1]) + + # verify backbone works with out_features=None + config.out_features = None + model = FocalNetBackbone(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + + # verify feature maps + self.parent.assertEqual(len(result.feature_maps), 1) + self.parent.assertListEqual(list(result.feature_maps[0].shape), [self.batch_size, self.image_size * 2, 4, 4]) + + # verify channels + self.parent.assertEqual(len(model.channels), 1) + self.parent.assertListEqual(model.channels, [config.hidden_sizes[-1]]) + def create_and_check_for_masked_image_modeling(self, config, pixel_values, labels): model = FocalNetForMaskedImageModeling(config=config) model.to(torch_device) @@ -191,6 +232,7 @@ class FocalNetModelTest(ModelTesterMixin, unittest.TestCase): FocalNetModel, FocalNetForImageClassification, FocalNetForMaskedImageModeling, + FocalNetBackbone, ) if is_torch_available() else () @@ -204,7 +246,7 @@ 
class FocalNetModelTest(ModelTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = FocalNetModelTester(self) - self.config_tester = ConfigTester(self, config_class=FocalNetConfig, embed_dim=37) + self.config_tester = ConfigTester(self, config_class=FocalNetConfig, embed_dim=37, has_text_modality=False) def test_config(self): self.create_and_test_config_common_properties() @@ -222,6 +264,10 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + def test_backbone(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_backbone(*config_and_inputs) + def test_for_masked_image_modeling(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_masked_image_modeling(*config_and_inputs) @@ -234,14 +280,14 @@ def test_for_image_classification(self): def test_inputs_embeds(self): pass - @unittest.skip(reason="FocalNet Transformer does not use feedforward chunking") + @unittest.skip(reason="FocalNet does not use feedforward chunking") def test_feed_forward_chunking(self): pass def test_model_common_attributes(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() - for model_class in self.all_model_classes: + for model_class in self.all_model_classes[:-1]: model = model_class(config) self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) x = model.get_output_embeddings() @@ -250,7 +296,7 @@ def test_model_common_attributes(self): def test_forward_signature(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() - for model_class in self.all_model_classes: + for model_class in self.all_model_classes[:-1]: model = model_class(config) signature = inspect.signature(model.forward) # signature.parameters is an OrderedDict => so arg_names order is deterministic @@ -309,7 +355,7 @@ def test_hidden_states_output(self): else (self.model_tester.image_size, self.model_tester.image_size) ) - for model_class in self.all_model_classes: + for model_class in self.all_model_classes[:-1]: inputs_dict["output_hidden_states"] = True self.check_hidden_states_output(inputs_dict, config, model_class, image_size) @@ -337,7 +383,7 @@ def test_hidden_states_output_with_padding(self): padded_height = image_size[0] + patch_size[0] - (image_size[0] % patch_size[0]) padded_width = image_size[1] + patch_size[1] - (image_size[1] % patch_size[1]) - for model_class in self.all_model_classes: + for model_class in self.all_model_classes[:-1]: inputs_dict["output_hidden_states"] = True self.check_hidden_states_output(inputs_dict, config, model_class, (padded_height, padded_width)) @@ -393,3 +439,14 @@ def test_inference_image_classification_head(self): expected_slice = torch.tensor([0.2166, -0.4368, 0.2191]).to(torch_device) self.assertTrue(torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4)) self.assertTrue(outputs.logits.argmax(dim=-1).item(), 281) + + +@require_torch +class FocalNetBackboneTest(BackboneTesterMixin, unittest.TestCase): + all_model_classes = (FocalNetBackbone,) if is_torch_available() else () + config_class = FocalNetConfig + + has_attentions = False + + def setUp(self): + self.model_tester = FocalNetModelTester(self) diff --git a/tests/test_backbone_common.py b/tests/test_backbone_common.py index 80e68a2f44ad..6bcf47004bd2 100644 --- a/tests/test_backbone_common.py +++ b/tests/test_backbone_common.py @@ -135,6 +135,8 @@ def 
test_backbone_common_attributes(self): # Verify num_features has been initialized in the backbone init self.assertIsNotNone(backbone.num_features) self.assertTrue(len(backbone.channels) == len(backbone.out_indices)) + print(backbone.stage_names) + print(backbone.num_features) self.assertTrue(len(backbone.stage_names) == len(backbone.num_features)) self.assertTrue(len(backbone.channels) <= len(backbone.num_features)) self.assertTrue(len(backbone.out_feature_channels) == len(backbone.stage_names)) diff --git a/utils/check_repo.py b/utils/check_repo.py index 5bdec16b9ece..7280381faf97 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -836,6 +836,7 @@ def find_all_documented_objects(): "ConvNextBackbone", "ConvNextV2Backbone", "DinatBackbone", + "FocalNetBackbone", "MaskFormerSwinBackbone", "MaskFormerSwinConfig", "MaskFormerSwinModel", From e3ee45aa54680bc2c7d57acb568d0570bc28044c Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Wed, 3 May 2023 18:47:36 +0200 Subject: [PATCH 017/935] Enable to use custom tracer in FX `symbolic_trace` (#23105) * Enable to use custom tracer in FX `symbolic_trace` * Integrate feedback from review * Formatting Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --------- Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- src/transformers/utils/fx.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/transformers/utils/fx.py b/src/transformers/utils/fx.py index 2f5b4810ecf2..e82d44c80281 100755 --- a/src/transformers/utils/fx.py +++ b/src/transformers/utils/fx.py @@ -1207,6 +1207,7 @@ def symbolic_trace( model: PreTrainedModel, input_names: Optional[List[str]] = None, disable_check: bool = False, + tracer_cls: Type[HFTracer] = HFTracer, ) -> GraphModule: """ Performs symbolic tracing on the model. @@ -1218,6 +1219,8 @@ def symbolic_trace( The names of the inputs of the traced model. If unset, model.dummy_inputs.keys() are used instead. disable_check (`bool`, *optional*, defaults to `False`): If `True`, no check is done before trying to trace the model, this is mostly usesul for debugging purposes. + tracer_cls (`Type[HFTracer]`, *optional*, defaults to `HFTracer`): + The tracer class to use for instantiating the tracer. If unset, `HFTracer` is used instead. Returns: `torch.fx.GraphModule`: A GraphModule constructed by recording operations seen while tracing the model. @@ -1240,7 +1243,7 @@ def symbolic_trace( check_if_model_is_supported(model) # Tracing. 
- tracer = HFTracer() + tracer = tracer_cls() traced_graph = tracer.trace(model, concrete_args=concrete_args) traced = torch.fx.GraphModule(model, traced_graph) From b0a78091a5b2f7e872140cf2d3795e4c56c9c95d Mon Sep 17 00:00:00 2001 From: Alara Dirik <8944735+alaradirik@users.noreply.github.com> Date: Wed, 3 May 2023 20:04:48 +0300 Subject: [PATCH 018/935] Remove redundant print statements (#23133) remove redundant print statements --- tests/test_backbone_common.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_backbone_common.py b/tests/test_backbone_common.py index 6bcf47004bd2..80e68a2f44ad 100644 --- a/tests/test_backbone_common.py +++ b/tests/test_backbone_common.py @@ -135,8 +135,6 @@ def test_backbone_common_attributes(self): # Verify num_features has been initialized in the backbone init self.assertIsNotNone(backbone.num_features) self.assertTrue(len(backbone.channels) == len(backbone.out_indices)) - print(backbone.stage_names) - print(backbone.num_features) self.assertTrue(len(backbone.stage_names) == len(backbone.num_features)) self.assertTrue(len(backbone.channels) <= len(backbone.num_features)) self.assertTrue(len(backbone.out_feature_channels) == len(backbone.stage_names)) From b6933d76d27fd14a835b9ea095d56725c69f4796 Mon Sep 17 00:00:00 2001 From: Robert Stone Date: Wed, 3 May 2023 12:50:41 -0700 Subject: [PATCH 019/935] Tidy Pytorch GLUE benchmark example (#23134) Migration to Evaluate for metric is not quite complete --- examples/pytorch/text-classification/run_glue.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/examples/pytorch/text-classification/run_glue.py b/examples/pytorch/text-classification/run_glue.py index 1bb4c7bee7b8..dd81d535df7b 100755 --- a/examples/pytorch/text-classification/run_glue.py +++ b/examples/pytorch/text-classification/run_glue.py @@ -486,6 +486,8 @@ def preprocess_function(examples): # Get the metric function if data_args.task_name is not None: metric = evaluate.load("glue", data_args.task_name) + elif is_regression: + metric = evaluate.load("mse") else: metric = evaluate.load("accuracy") @@ -494,15 +496,10 @@ def preprocess_function(examples): def compute_metrics(p: EvalPrediction): preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1) - if data_args.task_name is not None: - result = metric.compute(predictions=preds, references=p.label_ids) - if len(result) > 1: - result["combined_score"] = np.mean(list(result.values())).item() - return result - elif is_regression: - return {"mse": ((preds - p.label_ids) ** 2).mean().item()} - else: - return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()} + result = metric.compute(predictions=preds, references=p.label_ids) + if len(result) > 1: + result["combined_score"] = np.mean(list(result.values())).item() + return result # Data collator will default to DataCollatorWithPadding when the tokenizer is passed to Trainer, so we change it if # we already did the padding. 
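For illustration, a minimal sketch of the consolidated metric path that `run_glue.py` follows after the patch above when the task is a regression one such as STS-B; the prediction and label arrays are toy values, the snippet is not taken from the patch, and it assumes the `evaluate` package (plus the dependencies of its `mse` and `accuracy` metrics, e.g. scikit-learn) is installed:

```python
import evaluate
import numpy as np

# Toy setup mirroring the branch above: a regression task loads the "mse"
# metric, classification tasks without a GLUE task name load "accuracy".
is_regression = True
metric = evaluate.load("mse") if is_regression else evaluate.load("accuracy")

preds = np.array([0.1, 2.4, 4.9])   # illustrative squeezed model predictions
labels = np.array([0.0, 2.5, 5.0])  # illustrative gold scores

result = metric.compute(predictions=preds, references=labels)
if len(result) > 1:
    result["combined_score"] = np.mean(list(result.values())).item()
print(result)  # e.g. {'mse': 0.01} up to float rounding
```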
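Likewise, for the FX patch above (the new `tracer_cls` argument of `symbolic_trace`), a minimal usage sketch; `MyTracer` and the choice of checkpoint are illustrative assumptions rather than part of the patch:

```python
from transformers import AutoModelForSequenceClassification
from transformers.utils.fx import HFTracer, symbolic_trace

# Hypothetical tracer subclass, used purely for illustration: it logs every
# module that the tracer decides to keep as a leaf during tracing.
class MyTracer(HFTracer):
    def is_leaf_module(self, module, module_qualified_name):
        leaf = super().is_leaf_module(module, module_qualified_name)
        if leaf:
            print(f"leaf module: {module_qualified_name}")
        return leaf

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
traced = symbolic_trace(
    model,
    input_names=["input_ids", "attention_mask", "token_type_ids"],
    tracer_cls=MyTracer,  # the argument introduced by the FX patch above
)
```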
From 78b7debf56efb907c6af767882162050d4fbb294 Mon Sep 17 00:00:00 2001 From: peter-sk Date: Wed, 3 May 2023 21:59:19 +0200 Subject: [PATCH 020/935] GPTNeoForQuestionAnswering (#23057) * first draft - gives index error in question_answering.py * maturing * no labels * pipeline should know about QA * fixing checks * formatting * fixed docstring * initial commit * formatting * adding the class to many places * towards less unhappy checks * nearly there * Update src/transformers/models/gpt_neo/modeling_gpt_neo.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * avoid error * moving to device of star/end_logits --------- Co-authored-by: Prof. Peter Schneider-Kamp Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- docs/source/en/model_doc/gpt_neo.mdx | 5 + docs/source/en/tasks/question_answering.mdx | 2 +- src/transformers/__init__.py | 2 + src/transformers/models/auto/modeling_auto.py | 1 + src/transformers/models/gpt_neo/__init__.py | 2 + .../models/gpt_neo/modeling_gpt_neo.py | 102 ++++++++++++++++++ src/transformers/models/gptj/modeling_gptj.py | 4 +- src/transformers/utils/dummy_pt_objects.py | 7 ++ tests/models/gpt_neo/test_modeling_gpt_neo.py | 25 ++++- 9 files changed, 146 insertions(+), 4 deletions(-) diff --git a/docs/source/en/model_doc/gpt_neo.mdx b/docs/source/en/model_doc/gpt_neo.mdx index 4d6e8c58ce4b..a766a85cf151 100644 --- a/docs/source/en/model_doc/gpt_neo.mdx +++ b/docs/source/en/model_doc/gpt_neo.mdx @@ -69,6 +69,11 @@ The `generate()` method can be used to generate text using GPT Neo model. [[autodoc]] GPTNeoForCausalLM - forward +## GPTNeoForQuestionAnswering + +[[autodoc]] GPTNeoForQuestionAnswering + - forward + ## GPTNeoForSequenceClassification [[autodoc]] GPTNeoForSequenceClassification diff --git a/docs/source/en/tasks/question_answering.mdx b/docs/source/en/tasks/question_answering.mdx index a079a9265c84..3dd11140c253 100644 --- a/docs/source/en/tasks/question_answering.mdx +++ b/docs/source/en/tasks/question_answering.mdx @@ -31,7 +31,7 @@ The task illustrated in this tutorial is supported by the following model archit -[ALBERT](../model_doc/albert), [BART](../model_doc/bart), [BERT](../model_doc/bert), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CANINE](../model_doc/canine), [ConvBERT](../model_doc/convbert), [Data2VecText](../model_doc/data2vec-text), [DeBERTa](../model_doc/deberta), [DeBERTa-v2](../model_doc/deberta-v2), [DistilBERT](../model_doc/distilbert), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [ErnieM](../model_doc/ernie_m), [FlauBERT](../model_doc/flaubert), [FNet](../model_doc/fnet), [Funnel Transformer](../model_doc/funnel), [OpenAI GPT-2](../model_doc/gpt2), [GPT-J](../model_doc/gptj), [I-BERT](../model_doc/ibert), [LayoutLMv2](../model_doc/layoutlmv2), [LayoutLMv3](../model_doc/layoutlmv3), [LED](../model_doc/led), [LiLT](../model_doc/lilt), [Longformer](../model_doc/longformer), [LUKE](../model_doc/luke), [LXMERT](../model_doc/lxmert), [MarkupLM](../model_doc/markuplm), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MobileBERT](../model_doc/mobilebert), [MPNet](../model_doc/mpnet), [MVP](../model_doc/mvp), [Nezha](../model_doc/nezha), [Nyströmformer](../model_doc/nystromformer), [OPT](../model_doc/opt), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), 
[RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [Splinter](../model_doc/splinter), [SqueezeBERT](../model_doc/squeezebert), [XLM](../model_doc/xlm), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod), [YOSO](../model_doc/yoso) +[ALBERT](../model_doc/albert), [BART](../model_doc/bart), [BERT](../model_doc/bert), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CANINE](../model_doc/canine), [ConvBERT](../model_doc/convbert), [Data2VecText](../model_doc/data2vec-text), [DeBERTa](../model_doc/deberta), [DeBERTa-v2](../model_doc/deberta-v2), [DistilBERT](../model_doc/distilbert), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [ErnieM](../model_doc/ernie_m), [FlauBERT](../model_doc/flaubert), [FNet](../model_doc/fnet), [Funnel Transformer](../model_doc/funnel), [OpenAI GPT-2](../model_doc/gpt2), [GPT Neo](../model_doc/gpt_neo), [GPT-J](../model_doc/gptj), [I-BERT](../model_doc/ibert), [LayoutLMv2](../model_doc/layoutlmv2), [LayoutLMv3](../model_doc/layoutlmv3), [LED](../model_doc/led), [LiLT](../model_doc/lilt), [Longformer](../model_doc/longformer), [LUKE](../model_doc/luke), [LXMERT](../model_doc/lxmert), [MarkupLM](../model_doc/markuplm), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MobileBERT](../model_doc/mobilebert), [MPNet](../model_doc/mpnet), [MVP](../model_doc/mvp), [Nezha](../model_doc/nezha), [Nyströmformer](../model_doc/nystromformer), [OPT](../model_doc/opt), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [Splinter](../model_doc/splinter), [SqueezeBERT](../model_doc/squeezebert), [XLM](../model_doc/xlm), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod), [YOSO](../model_doc/yoso) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 46be8c9d2c6f..1c74d242fc6b 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1690,6 +1690,7 @@ [ "GPT_NEO_PRETRAINED_MODEL_ARCHIVE_LIST", "GPTNeoForCausalLM", + "GPTNeoForQuestionAnswering", "GPTNeoForSequenceClassification", "GPTNeoForTokenClassification", "GPTNeoModel", @@ -5234,6 +5235,7 @@ from .models.gpt_neo import ( GPT_NEO_PRETRAINED_MODEL_ARCHIVE_LIST, GPTNeoForCausalLM, + GPTNeoForQuestionAnswering, GPTNeoForSequenceClassification, GPTNeoForTokenClassification, GPTNeoModel, diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 14847c7ad2ab..a53128a61216 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -736,6 +736,7 @@ ("fnet", "FNetForQuestionAnswering"), ("funnel", "FunnelForQuestionAnswering"), ("gpt2", "GPT2ForQuestionAnswering"), + ("gpt_neo", "GPTNeoForQuestionAnswering"), ("gptj", "GPTJForQuestionAnswering"), ("ibert", "IBertForQuestionAnswering"), ("layoutlmv2", "LayoutLMv2ForQuestionAnswering"), diff --git a/src/transformers/models/gpt_neo/__init__.py 
b/src/transformers/models/gpt_neo/__init__.py index a7ddaf848883..02ca0a11949b 100644 --- a/src/transformers/models/gpt_neo/__init__.py +++ b/src/transformers/models/gpt_neo/__init__.py @@ -29,6 +29,7 @@ _import_structure["modeling_gpt_neo"] = [ "GPT_NEO_PRETRAINED_MODEL_ARCHIVE_LIST", "GPTNeoForCausalLM", + "GPTNeoForQuestionAnswering", "GPTNeoForSequenceClassification", "GPTNeoForTokenClassification", "GPTNeoModel", @@ -61,6 +62,7 @@ from .modeling_gpt_neo import ( GPT_NEO_PRETRAINED_MODEL_ARCHIVE_LIST, GPTNeoForCausalLM, + GPTNeoForQuestionAnswering, GPTNeoForSequenceClassification, GPTNeoForTokenClassification, GPTNeoModel, diff --git a/src/transformers/models/gpt_neo/modeling_gpt_neo.py b/src/transformers/models/gpt_neo/modeling_gpt_neo.py index 808c3f39ebb1..02f7d5534bde 100755 --- a/src/transformers/models/gpt_neo/modeling_gpt_neo.py +++ b/src/transformers/models/gpt_neo/modeling_gpt_neo.py @@ -29,6 +29,7 @@ BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions, CausalLMOutputWithPast, + QuestionAnsweringModelOutput, SequenceClassifierOutputWithPast, TokenClassifierOutput, ) @@ -1012,3 +1013,104 @@ def forward( hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions, ) + + +@add_start_docstrings( + """ + The GPT-Neo Model transformer with a span classification head on top for extractive question-answering tasks like + SQuAD (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + GPT_NEO_START_DOCSTRING, +) +class GPTNeoForQuestionAnswering(GPTNeoPreTrainedModel): + _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"h\.\d+\.attn\.bias", r"lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.transformer = GPTNeoModel(config) + self.qa_outputs = nn.Linear(config.hidden_size, 2) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(GPT_NEO_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=QuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + real_checkpoint=_CHECKPOINT_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + start_positions: Optional[torch.LongTensor] = None, + end_positions: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, QuestionAnsweringModelOutput]: + r""" + start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). 
Position outside of the sequence + are not taken into account for computing the loss. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/src/transformers/models/gptj/modeling_gptj.py b/src/transformers/models/gptj/modeling_gptj.py index 43b93234569a..18985cb3bce4 100644 --- a/src/transformers/models/gptj/modeling_gptj.py +++ b/src/transformers/models/gptj/modeling_gptj.py @@ -1127,9 +1127,9 @@ def forward( if start_positions is not None and end_positions is not None: # If we are on multi-GPU, split add a dimension if len(start_positions.size()) > 1: - start_positions = start_positions.squeeze(-1) + start_positions = start_positions.squeeze(-1).to(start_logits.device) if len(end_positions.size()) > 1: - end_positions = end_positions.squeeze(-1) + end_positions = end_positions.squeeze(-1).to(end_logits.device) # sometimes the start/end positions are outside our model inputs, we ignore these terms ignored_index = start_logits.size(1) start_positions = start_positions.clamp(0, ignored_index) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 1e9845ba9bcc..73827a3937a1 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -3287,6 +3287,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class GPTNeoForQuestionAnswering(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class GPTNeoForSequenceClassification(metaclass=DummyObject): _backends = ["torch"] diff --git a/tests/models/gpt_neo/test_modeling_gpt_neo.py b/tests/models/gpt_neo/test_modeling_gpt_neo.py index 0b5f69639471..c0334bd05aa3 100644 --- a/tests/models/gpt_neo/test_modeling_gpt_neo.py +++ b/tests/models/gpt_neo/test_modeling_gpt_neo.py @@ -34,6 +34,7 @@ 
GPT_NEO_PRETRAINED_MODEL_ARCHIVE_LIST, GPT2Tokenizer, GPTNeoForCausalLM, + GPTNeoForQuestionAnswering, GPTNeoForSequenceClassification, GPTNeoForTokenClassification, GPTNeoModel, @@ -325,6 +326,17 @@ def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mas self.parent.assertEqual(result.loss.shape, ()) self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + def create_and_check_gpt_neo_for_question_answering( + self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, *args + ): + config.num_labels = self.num_labels + model = GPTNeoForQuestionAnswering(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + def create_and_check_gpt_neo_for_sequence_classification( self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, *args ): @@ -385,7 +397,13 @@ def prepare_config_and_inputs_for_common(self): @require_torch class GPTNeoModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): all_model_classes = ( - (GPTNeoModel, GPTNeoForCausalLM, GPTNeoForSequenceClassification, GPTNeoForTokenClassification) + ( + GPTNeoModel, + GPTNeoForCausalLM, + GPTNeoForQuestionAnswering, + GPTNeoForSequenceClassification, + GPTNeoForTokenClassification, + ) if is_torch_available() else () ) @@ -393,6 +411,7 @@ class GPTNeoModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix pipeline_model_mapping = ( { "feature-extraction": GPTNeoModel, + "question-answering": GPTNeoForQuestionAnswering, "text-classification": GPTNeoForSequenceClassification, "token-classification": GPTNeoForTokenClassification, "text-generation": GPTNeoForCausalLM, @@ -438,6 +457,10 @@ def test_gpt_neo_lm_head_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_lm_head_model(*config_and_inputs) + def test_gpt_neo_question_answering_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt_neo_for_question_answering(*config_and_inputs) + def test_gpt_neo_sequence_classification_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_gpt_neo_for_sequence_classification(*config_and_inputs) From 90e8263d912daf8bdb6c3849d35b6588cf5cc39c Mon Sep 17 00:00:00 2001 From: amyeroberts <22614925+amyeroberts@users.noreply.github.com> Date: Thu, 4 May 2023 10:15:06 +0100 Subject: [PATCH 021/935] Add methods to update and verify out_features out_indices (#23031) * Add methods to update and verify out_features out_indices * Safe update for config attributes * Fix function names * Save config correctly * PR comments - use property setters * PR comment - directly set attributes * Update test * Add updates to recently merged focalnet backbone --- src/transformers/modeling_utils.py | 26 --- .../models/bit/configuration_bit.py | 38 +--- src/transformers/models/bit/modeling_bit.py | 11 +- .../models/convnext/configuration_convnext.py | 38 +--- .../models/convnext/modeling_convnext.py | 13 +- .../convnextv2/configuration_convnextv2.py | 38 +--- .../models/convnextv2/modeling_convnextv2.py | 13 +- .../models/dinat/configuration_dinat.py | 38 +--- 
.../models/dinat/modeling_dinat.py | 13 +- .../models/focalnet/configuration_focalnet.py | 38 +--- .../models/focalnet/modeling_focalnet.py | 11 +- .../configuration_maskformer_swin.py | 38 +--- .../maskformer/modeling_maskformer_swin.py | 12 +- .../models/nat/configuration_nat.py | 38 +--- src/transformers/models/nat/modeling_nat.py | 11 +- .../models/resnet/configuration_resnet.py | 38 +--- .../models/resnet/modeling_resnet.py | 11 +- .../models/swin/configuration_swin.py | 38 +--- src/transformers/models/swin/modeling_swin.py | 13 +- .../models/upernet/modeling_upernet.py | 3 +- src/transformers/utils/backbone_utils.py | 203 ++++++++++++++++++ tests/test_backbone_common.py | 21 +- tests/utils/test_backbone_utils.py | 102 +++++++++ 23 files changed, 420 insertions(+), 385 deletions(-) create mode 100644 src/transformers/utils/backbone_utils.py create mode 100644 tests/utils/test_backbone_utils.py diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 46e1c09dba1f..bf06d9c40538 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -1006,32 +1006,6 @@ def floating_point_ops( return 6 * self.estimate_tokens(input_dict) * self.num_parameters(exclude_embeddings=exclude_embeddings) -class BackboneMixin: - @property - def out_feature_channels(self): - # the current backbones will output the number of channels for each stage - # even if that stage is not in the out_features list. - return {stage: self.num_features[i] for i, stage in enumerate(self.stage_names)} - - @property - def channels(self): - return [self.out_feature_channels[name] for name in self.out_features] - - def forward_with_filtered_kwargs(self, *args, **kwargs): - signature = dict(inspect.signature(self.forward).parameters) - filtered_kwargs = {k: v for k, v in kwargs.items() if k in signature} - return self(*args, **filtered_kwargs) - - def forward( - self, - pixel_values: Tensor, - output_hidden_states: Optional[bool] = None, - output_attentions: Optional[bool] = None, - return_dict: Optional[bool] = None, - ): - raise NotImplementedError("This method should be implemented by the derived class.") - - class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMixin): r""" Base class for all models. diff --git a/src/transformers/models/bit/configuration_bit.py b/src/transformers/models/bit/configuration_bit.py index da53807f3f06..bfac3ab03f00 100644 --- a/src/transformers/models/bit/configuration_bit.py +++ b/src/transformers/models/bit/configuration_bit.py @@ -16,6 +16,7 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging +from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices logger = logging.get_logger(__name__) @@ -25,7 +26,7 @@ } -class BitConfig(PretrainedConfig): +class BitConfig(BackboneConfigMixin, PretrainedConfig): r""" This is the configuration class to store the configuration of a [`BitModel`]. It is used to instantiate an BiT model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -128,35 +129,6 @@ def __init__( self.width_factor = width_factor self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)] - - if out_features is not None and out_indices is not None: - if len(out_features) != len(out_indices): - raise ValueError("out_features and out_indices should have the same length if both are set") - elif out_features != [self.stage_names[idx] for idx in out_indices]: - raise ValueError("out_features and out_indices should correspond to the same stages if both are set") - - if out_features is None and out_indices is not None: - out_features = [self.stage_names[idx] for idx in out_indices] - elif out_features is not None and out_indices is None: - out_indices = [self.stage_names.index(feature) for feature in out_features] - elif out_features is None and out_indices is None: - out_features = [self.stage_names[-1]] - out_indices = [len(self.stage_names) - 1] - - if out_features is not None: - if not isinstance(out_features, list): - raise ValueError("out_features should be a list") - for feature in out_features: - if feature not in self.stage_names: - raise ValueError( - f"Feature {feature} is not a valid feature name. Valid names are {self.stage_names}" - ) - if out_indices is not None: - if not isinstance(out_indices, (list, tuple)): - raise ValueError("out_indices should be a list or tuple") - for idx in out_indices: - if idx >= len(self.stage_names): - raise ValueError(f"Index {idx} is not a valid index for a list of length {len(self.stage_names)}") - - self.out_features = out_features - self.out_indices = out_indices + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + out_features=out_features, out_indices=out_indices, stage_names=self.stage_names + ) diff --git a/src/transformers/models/bit/modeling_bit.py b/src/transformers/models/bit/modeling_bit.py index 6a63a7a3e29c..d440f180757b 100644 --- a/src/transformers/models/bit/modeling_bit.py +++ b/src/transformers/models/bit/modeling_bit.py @@ -31,7 +31,7 @@ BaseModelOutputWithPoolingAndNoAttention, ImageClassifierOutputWithNoAttention, ) -from ...modeling_utils import BackboneMixin, PreTrainedModel +from ...modeling_utils import PreTrainedModel from ...utils import ( add_code_sample_docstrings, add_start_docstrings, @@ -39,6 +39,7 @@ logging, replace_return_docstrings, ) +from ...utils.backbone_utils import BackboneMixin, get_aligned_output_features_output_indices from .configuration_bit import BitConfig @@ -848,12 +849,10 @@ def __init__(self, config): self.stage_names = config.stage_names self.bit = BitModel(config) - self.out_features = config.out_features if config.out_features is not None else [self.stage_names[-1]] self.num_features = [config.embedding_size] + config.hidden_sizes - if config.out_indices is not None: - self.out_indices = config.out_indices - else: - self.out_indices = tuple(i for i, layer in enumerate(self.stage_names) if layer in self.out_features) + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + config.out_features, config.out_indices, self.stage_names + ) # initialize weights and apply final processing self.post_init() diff --git a/src/transformers/models/convnext/configuration_convnext.py b/src/transformers/models/convnext/configuration_convnext.py index a4b7272295c7..0cba78040579 100644 --- a/src/transformers/models/convnext/configuration_convnext.py +++ b/src/transformers/models/convnext/configuration_convnext.py @@ -22,6 +22,7 @@ from 
...configuration_utils import PretrainedConfig from ...onnx import OnnxConfig from ...utils import logging +from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices logger = logging.get_logger(__name__) @@ -32,7 +33,7 @@ } -class ConvNextConfig(PretrainedConfig): +class ConvNextConfig(BackboneConfigMixin, PretrainedConfig): r""" This is the configuration class to store the configuration of a [`ConvNextModel`]. It is used to instantiate an ConvNeXT model according to the specified arguments, defining the model architecture. Instantiating a configuration @@ -119,38 +120,9 @@ def __init__( self.drop_path_rate = drop_path_rate self.image_size = image_size self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(self.depths) + 1)] - - if out_features is not None and out_indices is not None: - if len(out_features) != len(out_indices): - raise ValueError("out_features and out_indices should have the same length if both are set") - elif out_features != [self.stage_names[idx] for idx in out_indices]: - raise ValueError("out_features and out_indices should correspond to the same stages if both are set") - - if out_features is None and out_indices is not None: - out_features = [self.stage_names[idx] for idx in out_indices] - elif out_features is not None and out_indices is None: - out_indices = [self.stage_names.index(feature) for feature in out_features] - elif out_features is None and out_indices is None: - out_features = [self.stage_names[-1]] - out_indices = [len(self.stage_names) - 1] - - if out_features is not None: - if not isinstance(out_features, list): - raise ValueError("out_features should be a list") - for feature in out_features: - if feature not in self.stage_names: - raise ValueError( - f"Feature {feature} is not a valid feature name. 
Valid names are {self.stage_names}" - ) - if out_indices is not None: - if not isinstance(out_indices, (list, tuple)): - raise ValueError("out_indices should be a list or tuple") - for idx in out_indices: - if idx >= len(self.stage_names): - raise ValueError(f"Index {idx} is not a valid index for a list of length {len(self.stage_names)}") - - self.out_features = out_features - self.out_indices = out_indices + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + out_features=out_features, out_indices=out_indices, stage_names=self.stage_names + ) class ConvNextOnnxConfig(OnnxConfig): diff --git a/src/transformers/models/convnext/modeling_convnext.py b/src/transformers/models/convnext/modeling_convnext.py index 35302bffed66..1748e68aeec1 100755 --- a/src/transformers/models/convnext/modeling_convnext.py +++ b/src/transformers/models/convnext/modeling_convnext.py @@ -29,7 +29,7 @@ BaseModelOutputWithPoolingAndNoAttention, ImageClassifierOutputWithNoAttention, ) -from ...modeling_utils import BackboneMixin, PreTrainedModel +from ...modeling_utils import PreTrainedModel from ...utils import ( add_code_sample_docstrings, add_start_docstrings, @@ -37,6 +37,7 @@ logging, replace_return_docstrings, ) +from ...utils.backbone_utils import BackboneMixin, get_aligned_output_features_output_indices from .configuration_convnext import ConvNextConfig @@ -485,16 +486,14 @@ def __init__(self, config): self.embeddings = ConvNextEmbeddings(config) self.encoder = ConvNextEncoder(config) - self.out_features = config.out_features if config.out_features is not None else [self.stage_names[-1]] self.num_features = [config.hidden_sizes[0]] + config.hidden_sizes - if config.out_indices is not None: - self.out_indices = config.out_indices - else: - self.out_indices = tuple(i for i, layer in enumerate(self.stage_names) if layer in self.out_features) + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + config.out_features, config.out_indices, self.stage_names + ) # Add layer norms to hidden states of out_features hidden_states_norms = {} - for stage, num_channels in zip(self.out_features, self.channels): + for stage, num_channels in zip(self._out_features, self.channels): hidden_states_norms[stage] = ConvNextLayerNorm(num_channels, data_format="channels_first") self.hidden_states_norms = nn.ModuleDict(hidden_states_norms) diff --git a/src/transformers/models/convnextv2/configuration_convnextv2.py b/src/transformers/models/convnextv2/configuration_convnextv2.py index f02a21371b20..14dfcf85124e 100644 --- a/src/transformers/models/convnextv2/configuration_convnextv2.py +++ b/src/transformers/models/convnextv2/configuration_convnextv2.py @@ -17,6 +17,7 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging +from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices logger = logging.get_logger(__name__) @@ -26,7 +27,7 @@ } -class ConvNextV2Config(PretrainedConfig): +class ConvNextV2Config(BackboneConfigMixin, PretrainedConfig): r""" This is the configuration class to store the configuration of a [`ConvNextV2Model`]. It is used to instantiate an ConvNeXTV2 model according to the specified arguments, defining the model architecture. 
Instantiating a @@ -109,35 +110,6 @@ def __init__( self.drop_path_rate = drop_path_rate self.image_size = image_size self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(self.depths) + 1)] - - if out_features is not None and out_indices is not None: - if len(out_features) != len(out_indices): - raise ValueError("out_features and out_indices should have the same length if both are set") - elif out_features != [self.stage_names[idx] for idx in out_indices]: - raise ValueError("out_features and out_indices should correspond to the same stages if both are set") - - if out_features is None and out_indices is not None: - out_features = [self.stage_names[idx] for idx in out_indices] - elif out_features is not None and out_indices is None: - out_indices = [self.stage_names.index(feature) for feature in out_features] - elif out_features is None and out_indices is None: - out_features = [self.stage_names[-1]] - out_indices = [len(self.stage_names) - 1] - - if out_features is not None: - if not isinstance(out_features, list): - raise ValueError("out_features should be a list") - for feature in out_features: - if feature not in self.stage_names: - raise ValueError( - f"Feature {feature} is not a valid feature name. Valid names are {self.stage_names}" - ) - if out_indices is not None: - if not isinstance(out_indices, (list, tuple)): - raise ValueError("out_indices should be a list or tuple") - for idx in out_indices: - if idx >= len(self.stage_names): - raise ValueError(f"Index {idx} is not a valid index for a list of length {len(self.stage_names)}") - - self.out_features = out_features - self.out_indices = out_indices + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + out_features=out_features, out_indices=out_indices, stage_names=self.stage_names + ) diff --git a/src/transformers/models/convnextv2/modeling_convnextv2.py b/src/transformers/models/convnextv2/modeling_convnextv2.py index c309cdc3b6e2..c4cac4eb39fc 100644 --- a/src/transformers/models/convnextv2/modeling_convnextv2.py +++ b/src/transformers/models/convnextv2/modeling_convnextv2.py @@ -29,7 +29,7 @@ BaseModelOutputWithPoolingAndNoAttention, ImageClassifierOutputWithNoAttention, ) -from ...modeling_utils import BackboneMixin, PreTrainedModel +from ...modeling_utils import PreTrainedModel from ...utils import ( add_code_sample_docstrings, add_start_docstrings, @@ -37,6 +37,7 @@ logging, replace_return_docstrings, ) +from ...utils.backbone_utils import BackboneMixin, get_aligned_output_features_output_indices from .configuration_convnextv2 import ConvNextV2Config @@ -508,16 +509,14 @@ def __init__(self, config): self.embeddings = ConvNextV2Embeddings(config) self.encoder = ConvNextV2Encoder(config) - self.out_features = config.out_features if config.out_features is not None else [self.stage_names[-1]] self.num_features = [config.hidden_sizes[0]] + config.hidden_sizes - if config.out_indices is not None: - self.out_indices = config.out_indices - else: - self.out_indices = tuple(i for i, layer in enumerate(self.stage_names) if layer in self.out_features) + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + config.out_features, config.out_indices, self.stage_names + ) # Add layer norms to hidden states of out_features hidden_states_norms = {} - for stage, num_channels in zip(self.out_features, self.channels): + for stage, num_channels in zip(self._out_features, self.channels): hidden_states_norms[stage] = ConvNextV2LayerNorm(num_channels, 
data_format="channels_first") self.hidden_states_norms = nn.ModuleDict(hidden_states_norms) diff --git a/src/transformers/models/dinat/configuration_dinat.py b/src/transformers/models/dinat/configuration_dinat.py index 7c6a84ecdd13..963c72f29bd4 100644 --- a/src/transformers/models/dinat/configuration_dinat.py +++ b/src/transformers/models/dinat/configuration_dinat.py @@ -16,6 +16,7 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging +from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices logger = logging.get_logger(__name__) @@ -26,7 +27,7 @@ } -class DinatConfig(PretrainedConfig): +class DinatConfig(BackboneConfigMixin, PretrainedConfig): r""" This is the configuration class to store the configuration of a [`DinatModel`]. It is used to instantiate a Dinat model according to the specified arguments, defining the model architecture. Instantiating a configuration with the @@ -145,35 +146,6 @@ def __init__( self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1)) self.layer_scale_init_value = layer_scale_init_value self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)] - - if out_features is not None and out_indices is not None: - if len(out_features) != len(out_indices): - raise ValueError("out_features and out_indices should have the same length if both are set") - elif out_features != [self.stage_names[idx] for idx in out_indices]: - raise ValueError("out_features and out_indices should correspond to the same stages if both are set") - - if out_features is None and out_indices is not None: - out_features = [self.stage_names[idx] for idx in out_indices] - elif out_features is not None and out_indices is None: - out_indices = [self.stage_names.index(feature) for feature in out_features] - elif out_features is None and out_indices is None: - out_features = [self.stage_names[-1]] - out_indices = [len(self.stage_names) - 1] - - if out_features is not None: - if not isinstance(out_features, list): - raise ValueError("out_features should be a list") - for feature in out_features: - if feature not in self.stage_names: - raise ValueError( - f"Feature {feature} is not a valid feature name. 
Valid names are {self.stage_names}" - ) - if out_indices is not None: - if not isinstance(out_indices, (list, tuple)): - raise ValueError("out_indices should be a list or tuple") - for idx in out_indices: - if idx >= len(self.stage_names): - raise ValueError(f"Index {idx} is not a valid index for a list of length {len(self.stage_names)}") - - self.out_features = out_features - self.out_indices = out_indices + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + out_features=out_features, out_indices=out_indices, stage_names=self.stage_names + ) diff --git a/src/transformers/models/dinat/modeling_dinat.py b/src/transformers/models/dinat/modeling_dinat.py index 5b2394f122f1..7e3809c1a303 100644 --- a/src/transformers/models/dinat/modeling_dinat.py +++ b/src/transformers/models/dinat/modeling_dinat.py @@ -26,7 +26,7 @@ from ...activations import ACT2FN from ...modeling_outputs import BackboneOutput -from ...modeling_utils import BackboneMixin, PreTrainedModel +from ...modeling_utils import PreTrainedModel from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( ModelOutput, @@ -39,6 +39,7 @@ replace_return_docstrings, requires_backends, ) +from ...utils.backbone_utils import BackboneMixin, get_aligned_output_features_output_indices from .configuration_dinat import DinatConfig @@ -890,16 +891,14 @@ def __init__(self, config): self.embeddings = DinatEmbeddings(config) self.encoder = DinatEncoder(config) - self.out_features = config.out_features if config.out_features is not None else [self.stage_names[-1]] - if config.out_indices is not None: - self.out_indices = config.out_indices - else: - self.out_indices = tuple(i for i, layer in enumerate(self.stage_names) if layer in self.out_features) + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + config.out_features, config.out_indices, self.stage_names + ) self.num_features = [config.embed_dim] + [int(config.embed_dim * 2**i) for i in range(len(config.depths))] # Add layer norms to hidden states of out_features hidden_states_norms = {} - for stage, num_channels in zip(self.out_features, self.channels): + for stage, num_channels in zip(self._out_features, self.channels): hidden_states_norms[stage] = nn.LayerNorm(num_channels) self.hidden_states_norms = nn.ModuleDict(hidden_states_norms) diff --git a/src/transformers/models/focalnet/configuration_focalnet.py b/src/transformers/models/focalnet/configuration_focalnet.py index c6814e1dda14..f4bcd0ddce3b 100644 --- a/src/transformers/models/focalnet/configuration_focalnet.py +++ b/src/transformers/models/focalnet/configuration_focalnet.py @@ -16,6 +16,7 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging +from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices logger = logging.get_logger(__name__) @@ -25,7 +26,7 @@ } -class FocalNetConfig(PretrainedConfig): +class FocalNetConfig(BackboneConfigMixin, PretrainedConfig): r""" This is the configuration class to store the configuration of a [`FocalNetModel`]. It is used to instantiate a FocalNet model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -156,35 +157,6 @@ def __init__( self.layer_norm_eps = layer_norm_eps self.encoder_stride = encoder_stride self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(self.depths) + 1)] - - if out_features is not None and out_indices is not None: - if len(out_features) != len(out_indices): - raise ValueError("out_features and out_indices should have the same length if both are set") - elif out_features != [self.stage_names[idx] for idx in out_indices]: - raise ValueError("out_features and out_indices should correspond to the same stages if both are set") - - if out_features is None and out_indices is not None: - out_features = [self.stage_names[idx] for idx in out_indices] - elif out_features is not None and out_indices is None: - out_indices = [self.stage_names.index(feature) for feature in out_features] - elif out_features is None and out_indices is None: - out_features = [self.stage_names[-1]] - out_indices = [len(self.stage_names) - 1] - - if out_features is not None: - if not isinstance(out_features, list): - raise ValueError("out_features should be a list") - for feature in out_features: - if feature not in self.stage_names: - raise ValueError( - f"Feature {feature} is not a valid feature name. Valid names are {self.stage_names}" - ) - if out_indices is not None: - if not isinstance(out_indices, (list, tuple)): - raise ValueError("out_indices should be a list or tuple") - for idx in out_indices: - if idx >= len(self.stage_names): - raise ValueError(f"Index {idx} is not a valid index for a list of length {len(self.stage_names)}") - - self.out_features = out_features - self.out_indices = out_indices + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + out_features=out_features, out_indices=out_indices, stage_names=self.stage_names + ) diff --git a/src/transformers/models/focalnet/modeling_focalnet.py b/src/transformers/models/focalnet/modeling_focalnet.py index cfd64689763b..e7ebdda5e5d4 100644 --- a/src/transformers/models/focalnet/modeling_focalnet.py +++ b/src/transformers/models/focalnet/modeling_focalnet.py @@ -27,7 +27,7 @@ from ...activations import ACT2FN from ...modeling_outputs import BackboneOutput -from ...modeling_utils import BackboneMixin, PreTrainedModel +from ...modeling_utils import PreTrainedModel from ...utils import ( ModelOutput, add_code_sample_docstrings, @@ -36,6 +36,7 @@ logging, replace_return_docstrings, ) +from ...utils.backbone_utils import BackboneMixin, get_aligned_output_features_output_indices from .configuration_focalnet import FocalNetConfig @@ -987,11 +988,9 @@ def __init__(self, config): self.focalnet = FocalNetModel(config) self.num_features = [config.embed_dim] + config.hidden_sizes - self.out_features = config.out_features if config.out_features is not None else [self.stage_names[-1]] - if config.out_indices is not None: - self.out_indices = config.out_indices - else: - self.out_indices = tuple(i for i, layer in enumerate(self.stage_names) if layer in self.out_features) + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + config.out_features, config.out_indices, self.stage_names + ) # initialize weights and apply final processing self.post_init() diff --git a/src/transformers/models/maskformer/configuration_maskformer_swin.py b/src/transformers/models/maskformer/configuration_maskformer_swin.py index ca60b6176eed..7c3ac54bd80d 100644 --- a/src/transformers/models/maskformer/configuration_maskformer_swin.py +++ 
b/src/transformers/models/maskformer/configuration_maskformer_swin.py @@ -16,12 +16,13 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging +from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices logger = logging.get_logger(__name__) -class MaskFormerSwinConfig(PretrainedConfig): +class MaskFormerSwinConfig(BackboneConfigMixin, PretrainedConfig): r""" This is the configuration class to store the configuration of a [`MaskFormerSwinModel`]. It is used to instantiate a Donut model according to the specified arguments, defining the model architecture. Instantiating a configuration @@ -141,35 +142,6 @@ def __init__( # this indicates the channel dimension after the last stage of the model self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1)) self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)] - - if out_features is not None and out_indices is not None: - if len(out_features) != len(out_indices): - raise ValueError("out_features and out_indices should have the same length if both are set") - elif out_features != [self.stage_names[idx] for idx in out_indices]: - raise ValueError("out_features and out_indices should correspond to the same stages if both are set") - - if out_features is None and out_indices is not None: - out_features = [self.stage_names[idx] for idx in out_indices] - elif out_features is not None and out_indices is None: - out_indices = [self.stage_names.index(feature) for feature in out_features] - elif out_features is None and out_indices is None: - out_features = [self.stage_names[-1]] - out_indices = [len(self.stage_names) - 1] - - if out_features is not None: - if not isinstance(out_features, list): - raise ValueError("out_features should be a list") - for feature in out_features: - if feature not in self.stage_names: - raise ValueError( - f"Feature {feature} is not a valid feature name. 
Valid names are {self.stage_names}" - ) - if out_indices is not None: - if not isinstance(out_indices, (list, tuple)): - raise ValueError("out_indices should be a list or tuple") - for idx in out_indices: - if idx >= len(self.stage_names): - raise ValueError(f"Index {idx} is not a valid index for a list of length {len(self.stage_names)}") - - self.out_features = out_features - self.out_indices = out_indices + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + out_features=out_features, out_indices=out_indices, stage_names=self.stage_names + ) diff --git a/src/transformers/models/maskformer/modeling_maskformer_swin.py b/src/transformers/models/maskformer/modeling_maskformer_swin.py index 8684a58a4e7b..c7b74a6f2bd7 100644 --- a/src/transformers/models/maskformer/modeling_maskformer_swin.py +++ b/src/transformers/models/maskformer/modeling_maskformer_swin.py @@ -27,8 +27,9 @@ from ...activations import ACT2FN from ...file_utils import ModelOutput from ...modeling_outputs import BackboneOutput -from ...modeling_utils import BackboneMixin, PreTrainedModel +from ...modeling_utils import PreTrainedModel from ...pytorch_utils import find_pruneable_heads_and_indices, meshgrid, prune_linear_layer +from ...utils.backbone_utils import BackboneMixin, get_aligned_output_features_output_indices from .configuration_maskformer_swin import MaskFormerSwinConfig @@ -855,14 +856,13 @@ def __init__(self, config: MaskFormerSwinConfig): self.stage_names = config.stage_names self.model = MaskFormerSwinModel(config) - self.out_features = config.out_features if config.out_features is not None else [self.stage_names[-1]] + self._out_features = config.out_features if config.out_features is not None else [self.stage_names[-1]] if "stem" in self.out_features: raise ValueError("This backbone does not support 'stem' in the `out_features`.") - if config.out_indices is not None: - self.out_indices = config.out_indices - else: - self.out_indices = tuple(i for i, layer in enumerate(self.stage_names) if layer in self.out_features) + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + config.out_features, config.out_indices, self.stage_names + ) self.num_features = [config.embed_dim] + [int(config.embed_dim * 2**i) for i in range(len(config.depths))] self.hidden_states_norms = nn.ModuleList( [nn.LayerNorm(num_channels) for num_channels in self.num_features[1:]] diff --git a/src/transformers/models/nat/configuration_nat.py b/src/transformers/models/nat/configuration_nat.py index a74b8c9165c7..5d8bd6b3c6eb 100644 --- a/src/transformers/models/nat/configuration_nat.py +++ b/src/transformers/models/nat/configuration_nat.py @@ -16,6 +16,7 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging +from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices logger = logging.get_logger(__name__) @@ -26,7 +27,7 @@ } -class NatConfig(PretrainedConfig): +class NatConfig(BackboneConfigMixin, PretrainedConfig): r""" This is the configuration class to store the configuration of a [`NatModel`]. It is used to instantiate a Nat model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -141,35 +142,6 @@ def __init__( self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1)) self.layer_scale_init_value = layer_scale_init_value self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)] - - if out_features is not None and out_indices is not None: - if len(out_features) != len(out_indices): - raise ValueError("out_features and out_indices should have the same length if both are set") - elif out_features != [self.stage_names[idx] for idx in out_indices]: - raise ValueError("out_features and out_indices should correspond to the same stages if both are set") - - if out_features is None and out_indices is not None: - out_features = [self.stage_names[idx] for idx in out_indices] - elif out_features is not None and out_indices is None: - out_indices = [self.stage_names.index(feature) for feature in out_features] - elif out_features is None and out_indices is None: - out_features = [self.stage_names[-1]] - out_indices = [len(self.stage_names) - 1] - - if out_features is not None: - if not isinstance(out_features, list): - raise ValueError("out_features should be a list") - for feature in out_features: - if feature not in self.stage_names: - raise ValueError( - f"Feature {feature} is not a valid feature name. Valid names are {self.stage_names}" - ) - if out_indices is not None: - if not isinstance(out_indices, (list, tuple)): - raise ValueError("out_indices should be a list or tuple") - for idx in out_indices: - if idx >= len(self.stage_names): - raise ValueError(f"Index {idx} is not a valid index for a list of length {len(self.stage_names)}") - - self.out_features = out_features - self.out_indices = out_indices + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + out_features=out_features, out_indices=out_indices, stage_names=self.stage_names + ) diff --git a/src/transformers/models/nat/modeling_nat.py b/src/transformers/models/nat/modeling_nat.py index c5e83b29da67..7634a08ad95b 100644 --- a/src/transformers/models/nat/modeling_nat.py +++ b/src/transformers/models/nat/modeling_nat.py @@ -26,7 +26,7 @@ from ...activations import ACT2FN from ...modeling_outputs import BackboneOutput -from ...modeling_utils import BackboneMixin, PreTrainedModel +from ...modeling_utils import PreTrainedModel from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( ModelOutput, @@ -39,6 +39,7 @@ replace_return_docstrings, requires_backends, ) +from ...utils.backbone_utils import BackboneMixin, get_aligned_output_features_output_indices from .configuration_nat import NatConfig @@ -868,11 +869,9 @@ def __init__(self, config): self.embeddings = NatEmbeddings(config) self.encoder = NatEncoder(config) - self.out_features = config.out_features if config.out_features is not None else [self.stage_names[-1]] - if config.out_indices is not None: - self.out_indices = config.out_indices - else: - self.out_indices = tuple(i for i, layer in enumerate(self.stage_names) if layer in self.out_features) + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + config.out_features, config.out_indices, self.stage_names + ) self.num_features = [config.embed_dim] + [int(config.embed_dim * 2**i) for i in range(len(config.depths))] # Add layer norms to hidden states of out_features diff --git a/src/transformers/models/resnet/configuration_resnet.py b/src/transformers/models/resnet/configuration_resnet.py index 6a88935f3b02..f12fe542a067 100644 --- 
a/src/transformers/models/resnet/configuration_resnet.py +++ b/src/transformers/models/resnet/configuration_resnet.py @@ -22,6 +22,7 @@ from ...configuration_utils import PretrainedConfig from ...onnx import OnnxConfig from ...utils import logging +from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices logger = logging.get_logger(__name__) @@ -31,7 +32,7 @@ } -class ResNetConfig(PretrainedConfig): +class ResNetConfig(BackboneConfigMixin, PretrainedConfig): r""" This is the configuration class to store the configuration of a [`ResNetModel`]. It is used to instantiate an ResNet model according to the specified arguments, defining the model architecture. Instantiating a configuration @@ -108,38 +109,9 @@ def __init__( self.hidden_act = hidden_act self.downsample_in_first_stage = downsample_in_first_stage self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)] - - if out_features is not None and out_indices is not None: - if len(out_features) != len(out_indices): - raise ValueError("out_features and out_indices should have the same length if both are set") - elif out_features != [self.stage_names[idx] for idx in out_indices]: - raise ValueError("out_features and out_indices should correspond to the same stages if both are set") - - if out_features is None and out_indices is not None: - out_features = [self.stage_names[idx] for idx in out_indices] - elif out_features is not None and out_indices is None: - out_indices = [self.stage_names.index(feature) for feature in out_features] - elif out_features is None and out_indices is None: - out_features = [self.stage_names[-1]] - out_indices = [len(self.stage_names) - 1] - - if out_features is not None: - if not isinstance(out_features, list): - raise ValueError("out_features should be a list") - for feature in out_features: - if feature not in self.stage_names: - raise ValueError( - f"Feature {feature} is not a valid feature name. 
Valid names are {self.stage_names}" - ) - if out_indices is not None: - if not isinstance(out_indices, (list, tuple)): - raise ValueError("out_indices should be a list or tuple") - for idx in out_indices: - if idx >= len(self.stage_names): - raise ValueError(f"Index {idx} is not a valid index for a list of length {len(self.stage_names)}") - - self.out_features = out_features - self.out_indices = out_indices + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + out_features=out_features, out_indices=out_indices, stage_names=self.stage_names + ) class ResNetOnnxConfig(OnnxConfig): diff --git a/src/transformers/models/resnet/modeling_resnet.py b/src/transformers/models/resnet/modeling_resnet.py index 6926f6a43116..b177cdeda6c1 100644 --- a/src/transformers/models/resnet/modeling_resnet.py +++ b/src/transformers/models/resnet/modeling_resnet.py @@ -28,7 +28,7 @@ BaseModelOutputWithPoolingAndNoAttention, ImageClassifierOutputWithNoAttention, ) -from ...modeling_utils import BackboneMixin, PreTrainedModel +from ...modeling_utils import PreTrainedModel from ...utils import ( add_code_sample_docstrings, add_start_docstrings, @@ -36,6 +36,7 @@ logging, replace_return_docstrings, ) +from ...utils.backbone_utils import BackboneMixin, get_aligned_output_features_output_indices from .configuration_resnet import ResNetConfig @@ -436,11 +437,9 @@ def __init__(self, config): self.embedder = ResNetEmbeddings(config) self.encoder = ResNetEncoder(config) - self.out_features = config.out_features if config.out_features is not None else [self.stage_names[-1]] - if config.out_indices is not None: - self.out_indices = config.out_indices - else: - self.out_indices = tuple(i for i, layer in enumerate(self.stage_names) if layer in self.out_features) + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + config.out_features, config.out_indices, self.stage_names + ) self.num_features = [config.embedding_size] + config.hidden_sizes # initialize weights and apply final processing diff --git a/src/transformers/models/swin/configuration_swin.py b/src/transformers/models/swin/configuration_swin.py index 612bc9949fb2..757112f8cebf 100644 --- a/src/transformers/models/swin/configuration_swin.py +++ b/src/transformers/models/swin/configuration_swin.py @@ -22,6 +22,7 @@ from ...configuration_utils import PretrainedConfig from ...onnx import OnnxConfig from ...utils import logging +from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices logger = logging.get_logger(__name__) @@ -34,7 +35,7 @@ } -class SwinConfig(PretrainedConfig): +class SwinConfig(BackboneConfigMixin, PretrainedConfig): r""" This is the configuration class to store the configuration of a [`SwinModel`]. It is used to instantiate a Swin model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -158,38 +159,9 @@ def __init__( # this indicates the channel dimension after the last stage of the model self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1)) self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)] - - if out_features is not None and out_indices is not None: - if len(out_features) != len(out_indices): - raise ValueError("out_features and out_indices should have the same length if both are set") - elif out_features != [self.stage_names[idx] for idx in out_indices]: - raise ValueError("out_features and out_indices should correspond to the same stages if both are set") - - if out_features is None and out_indices is not None: - out_features = [self.stage_names[idx] for idx in out_indices] - elif out_features is not None and out_indices is None: - out_indices = [self.stage_names.index(feature) for feature in out_features] - elif out_features is None and out_indices is None: - out_features = [self.stage_names[-1]] - out_indices = [len(self.stage_names) - 1] - - if out_features is not None: - if not isinstance(out_features, list): - raise ValueError("out_features should be a list") - for feature in out_features: - if feature not in self.stage_names: - raise ValueError( - f"Feature {feature} is not a valid feature name. Valid names are {self.stage_names}" - ) - if out_indices is not None: - if not isinstance(out_indices, (list, tuple)): - raise ValueError("out_indices should be a list or tuple") - for idx in out_indices: - if idx >= len(self.stage_names): - raise ValueError(f"Index {idx} is not a valid index for a list of length {len(self.stage_names)}") - - self.out_features = out_features - self.out_indices = out_indices + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + out_features=out_features, out_indices=out_indices, stage_names=self.stage_names + ) class SwinOnnxConfig(OnnxConfig): diff --git a/src/transformers/models/swin/modeling_swin.py b/src/transformers/models/swin/modeling_swin.py index 2f7cfeb1adbd..6482ff1b5bf2 100644 --- a/src/transformers/models/swin/modeling_swin.py +++ b/src/transformers/models/swin/modeling_swin.py @@ -28,7 +28,7 @@ from ...activations import ACT2FN from ...modeling_outputs import BackboneOutput -from ...modeling_utils import BackboneMixin, PreTrainedModel +from ...modeling_utils import PreTrainedModel from ...pytorch_utils import find_pruneable_heads_and_indices, meshgrid, prune_linear_layer from ...utils import ( ModelOutput, @@ -38,6 +38,7 @@ logging, replace_return_docstrings, ) +from ...utils.backbone_utils import BackboneMixin, get_aligned_output_features_output_indices from .configuration_swin import SwinConfig @@ -1264,16 +1265,14 @@ def __init__(self, config: SwinConfig): self.embeddings = SwinEmbeddings(config) self.encoder = SwinEncoder(config, self.embeddings.patch_grid) - self.out_features = config.out_features if config.out_features is not None else [self.stage_names[-1]] - if config.out_indices is not None: - self.out_indices = config.out_indices - else: - self.out_indices = tuple(i for i, layer in enumerate(self.stage_names) if layer in self.out_features) + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + config.out_features, config.out_indices, self.stage_names + ) self.num_features = [config.embed_dim] + [int(config.embed_dim * 2**i) for i in range(len(config.depths))] # Add layer norms to hidden states of out_features hidden_states_norms = {} - for stage, num_channels in 
zip(self.out_features, self.channels): + for stage, num_channels in zip(self._out_features, self.channels): hidden_states_norms[stage] = nn.LayerNorm(num_channels) self.hidden_states_norms = nn.ModuleDict(hidden_states_norms) diff --git a/src/transformers/models/upernet/modeling_upernet.py b/src/transformers/models/upernet/modeling_upernet.py index a00866f77dc4..6143c57e92d5 100644 --- a/src/transformers/models/upernet/modeling_upernet.py +++ b/src/transformers/models/upernet/modeling_upernet.py @@ -22,8 +22,9 @@ from ... import AutoBackbone from ...modeling_outputs import SemanticSegmenterOutput -from ...modeling_utils import BackboneMixin, PreTrainedModel +from ...modeling_utils import PreTrainedModel from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings +from ...utils.backbone_utils import BackboneMixin from .configuration_upernet import UperNetConfig diff --git a/src/transformers/utils/backbone_utils.py b/src/transformers/utils/backbone_utils.py new file mode 100644 index 000000000000..8c6b7107eb0e --- /dev/null +++ b/src/transformers/utils/backbone_utils.py @@ -0,0 +1,203 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" Collection of utils to be used by backbones and their components.""" + +import inspect +from typing import Iterable, List, Optional, Tuple, Union + + +def verify_out_features_out_indices( + out_features: Optional[Iterable[str]], out_indices: Optional[Iterable[int]], stage_names: Optional[Iterable[str]] +): + """ + Verify that out_indices and out_features are valid for the given stage_names. + """ + if stage_names is None: + raise ValueError("Stage_names must be set for transformers backbones") + + if out_features is not None: + if not isinstance(out_features, (list,)): + raise ValueError(f"out_features must be a list {type(out_features)}") + if any(feat not in stage_names for feat in out_features): + raise ValueError(f"out_features must be a subset of stage_names: {stage_names} got {out_features}") + + if out_indices is not None: + if not isinstance(out_indices, (list, tuple)): + raise ValueError(f"out_indices must be a list or tuple, got {type(out_indices)}") + if any(idx >= len(stage_names) for idx in out_indices): + raise ValueError("out_indices must be valid indices for stage_names {stage_names}, got {out_indices}") + + if out_features is not None and out_indices is not None: + if len(out_features) != len(out_indices): + raise ValueError("out_features and out_indices should have the same length if both are set") + if out_features != [stage_names[idx] for idx in out_indices]: + raise ValueError("out_features and out_indices should correspond to the same stages if both are set") + + +def _align_output_features_output_indices( + out_features: Optional[List[str]], + out_indices: Optional[Union[List[int], Tuple[int]]], + stage_names: List[str], +): + """ + Finds the corresponding `out_features` and `out_indices` for the given `stage_names`. 
+ + The logic is as follows: + - `out_features` not set, `out_indices` set: `out_features` is set to the `out_features` corresponding to the + `out_indices`. + - `out_indices` not set, `out_features` set: `out_indices` is set to the `out_indices` corresponding to the + `out_features`. + - `out_indices` and `out_features` not set: `out_indices` and `out_features` are set to the last stage. + - `out_indices` and `out_features` set: input `out_indices` and `out_features` are returned. + + Args: + out_features (`List[str]`): The names of the features for the backbone to output. + out_indices (`List[int]` or `Tuple[int]`): The indices of the features for the backbone to output. + stage_names (`List[str]`): The names of the stages of the backbone. + """ + if out_indices is None and out_features is None: + out_indices = [len(stage_names) - 1] + out_features = [stage_names[-1]] + elif out_indices is None and out_features is not None: + out_indices = [stage_names.index(layer) for layer in stage_names if layer in out_features] + elif out_features is None and out_indices is not None: + out_features = [stage_names[idx] for idx in out_indices] + return out_features, out_indices + + +def get_aligned_output_features_output_indices( + out_features: Optional[List[str]], + out_indices: Optional[Union[List[int], Tuple[int]]], + stage_names: List[str], +) -> Tuple[List[str], List[int]]: + """ + Get the `out_features` and `out_indices` so that they are aligned. + + The logic is as follows: + - `out_features` not set, `out_indices` set: `out_features` is set to the `out_features` corresponding to the + `out_indices`. + - `out_indices` not set, `out_features` set: `out_indices` is set to the `out_indices` corresponding to the + `out_features`. + - `out_indices` and `out_features` not set: `out_indices` and `out_features` are set to the last stage. + - `out_indices` and `out_features` set: they are verified to be aligned. + + Args: + out_features (`List[str]`): The names of the features for the backbone to output. + out_indices (`List[int]` or `Tuple[int]`): The indices of the features for the backbone to output. + stage_names (`List[str]`): The names of the stages of the backbone. + """ + # First verify that the out_features and out_indices are valid + verify_out_features_out_indices(out_features=out_features, out_indices=out_indices, stage_names=stage_names) + output_features, output_indices = _align_output_features_output_indices( + out_features=out_features, out_indices=out_indices, stage_names=stage_names + ) + # Verify that the aligned out_features and out_indices are valid + verify_out_features_out_indices(out_features=output_features, out_indices=output_indices, stage_names=stage_names) + return output_features, output_indices + + +class BackboneMixin: + @property + def out_feature_channels(self): + # the current backbones will output the number of channels for each stage + # even if that stage is not in the out_features list. 
+ return {stage: self.num_features[i] for i, stage in enumerate(self.stage_names)} + + @property + def channels(self): + return [self.out_feature_channels[name] for name in self.out_features] + + def forward_with_filtered_kwargs(self, *args, **kwargs): + signature = dict(inspect.signature(self.forward).parameters) + filtered_kwargs = {k: v for k, v in kwargs.items() if k in signature} + return self(*args, **filtered_kwargs) + + def forward( + self, + pixel_values, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + raise NotImplementedError("This method should be implemented by the derived class.") + + @property + def out_features(self): + return self._out_features + + @out_features.setter + def out_features(self, out_features: List[str]): + """ + Set the out_features attribute. This will also update the out_indices attribute to match the new out_features. + """ + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + out_features=out_features, out_indices=None, stage_names=self.stage_names + ) + + @property + def out_indices(self): + return self._out_indices + + @out_indices.setter + def out_indices(self, out_indices: Union[Tuple[int], List[int]]): + """ + Set the out_indices attribute. This will also update the out_features attribute to match the new out_indices. + """ + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + out_features=None, out_indices=out_indices, stage_names=self.stage_names + ) + + +class BackboneConfigMixin: + """ + A Mixin to support handling the `out_features` and `out_indices` attributes for the backbone configurations. + """ + + @property + def out_features(self): + return self._out_features + + @out_features.setter + def out_features(self, out_features: List[str]): + """ + Set the out_features attribute. This will also update the out_indices attribute to match the new out_features. + """ + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + out_features=out_features, out_indices=None, stage_names=self.stage_names + ) + + @property + def out_indices(self): + return self._out_indices + + @out_indices.setter + def out_indices(self, out_indices: Union[Tuple[int], List[int]]): + """ + Set the out_indices attribute. This will also update the out_features attribute to match the new out_indices. + """ + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + out_features=None, out_indices=out_indices, stage_names=self.stage_names + ) + + def to_dict(self): + """ + Serializes this instance to a Python dictionary. Override the default `to_dict()` from `PretrainedConfig` to + include the `out_features` and `out_indices` attributes. 
+ """ + output = super().to_dict() + output["out_features"] = output.pop("_out_features") + output["out_indices"] = output.pop("_out_indices") + return output diff --git a/tests/test_backbone_common.py b/tests/test_backbone_common.py index 80e68a2f44ad..fd9bbe3bfbfe 100644 --- a/tests/test_backbone_common.py +++ b/tests/test_backbone_common.py @@ -81,9 +81,15 @@ def test_channels(self): out_channels = [num_features[idx] for idx in out_indices] self.assertListEqual(model.channels, out_channels) - config.out_features = None - config.out_indices = None - model = model_class(config) + new_config = copy.deepcopy(config) + new_config.out_features = None + model = model_class(new_config) + self.assertEqual(len(model.channels), 1) + self.assertListEqual(model.channels, [num_features[-1]]) + + new_config = copy.deepcopy(config) + new_config.out_indices = None + model = model_class(new_config) self.assertEqual(len(model.channels), 1) self.assertListEqual(model.channels, [num_features[-1]]) @@ -102,6 +108,15 @@ def test_create_from_modified_config(self): # Check output of last stage is taken if out_features=None, out_indices=None modified_config = copy.deepcopy(config) modified_config.out_features = None + model = model_class(modified_config) + model.to(torch_device) + model.eval() + result = model(**inputs_dict) + + self.assertEqual(len(result.feature_maps), 1) + self.assertEqual(len(model.channels), 1) + + modified_config = copy.deepcopy(config) modified_config.out_indices = None model = model_class(modified_config) model.to(torch_device) diff --git a/tests/utils/test_backbone_utils.py b/tests/utils/test_backbone_utils.py new file mode 100644 index 000000000000..66b7087da246 --- /dev/null +++ b/tests/utils/test_backbone_utils.py @@ -0,0 +1,102 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +from transformers.utils.backbone_utils import ( + BackboneMixin, + get_aligned_output_features_output_indices, + verify_out_features_out_indices, +) + + +class BackboneUtilsTester(unittest.TestCase): + def test_get_aligned_output_features_output_indices(self): + stage_names = ["a", "b", "c"] + + # Defaults to last layer if both are None + out_features, out_indices = get_aligned_output_features_output_indices(None, None, stage_names) + self.assertEqual(out_features, ["c"]) + self.assertEqual(out_indices, [2]) + + # Out indices set to match out features + out_features, out_indices = get_aligned_output_features_output_indices(["a", "c"], None, stage_names) + self.assertEqual(out_features, ["a", "c"]) + self.assertEqual(out_indices, [0, 2]) + + # Out features set to match out indices + out_features, out_indices = get_aligned_output_features_output_indices(None, [0, 2], stage_names) + self.assertEqual(out_features, ["a", "c"]) + self.assertEqual(out_indices, [0, 2]) + + # Out features selected from negative indices + out_features, out_indices = get_aligned_output_features_output_indices(None, [-3, -1], stage_names) + self.assertEqual(out_features, ["a", "c"]) + self.assertEqual(out_indices, [-3, -1]) + + def test_verify_out_features_out_indices(self): + # Stage names must be set + with self.assertRaises(ValueError): + verify_out_features_out_indices(["a", "b"], (0, 1), None) + + # Out features must be a list + with self.assertRaises(ValueError): + verify_out_features_out_indices(("a", "b"), (0, 1), ["a", "b"]) + + # Out features must be a subset of stage names + with self.assertRaises(ValueError): + verify_out_features_out_indices(["a", "b"], (0, 1), ["a"]) + + # Out indices must be a list or tuple + with self.assertRaises(ValueError): + verify_out_features_out_indices(None, 0, ["a", "b"]) + + # Out indices must be a subset of stage names + with self.assertRaises(ValueError): + verify_out_features_out_indices(None, (0, 1), ["a"]) + + # Out features and out indices must be the same length + with self.assertRaises(ValueError): + verify_out_features_out_indices(["a", "b"], (0,), ["a", "b", "c"]) + + # Out features should match out indices + with self.assertRaises(ValueError): + verify_out_features_out_indices(["a", "b"], (0, 2), ["a", "b", "c"]) + + # Out features and out indices should be in order + with self.assertRaises(ValueError): + verify_out_features_out_indices(["b", "a"], (0, 1), ["a", "b"]) + + # Check passes with valid inputs + verify_out_features_out_indices(["a", "b", "d"], (0, 1, -1), ["a", "b", "c", "d"]) + + def test_backbone_mixin(self): + backbone = BackboneMixin() + + backbone.stage_names = ["a", "b", "c"] + backbone._out_features = ["a", "c"] + backbone._out_indices = [0, 2] + + # Check that the output features and indices are set correctly + self.assertEqual(backbone.out_features, ["a", "c"]) + self.assertEqual(backbone.out_indices, [0, 2]) + + # Check out features and indices are updated correctly + backbone.out_features = ["a", "b"] + self.assertEqual(backbone.out_features, ["a", "b"]) + self.assertEqual(backbone.out_indices, [0, 1]) + + backbone.out_indices = [-3, -1] + self.assertEqual(backbone.out_features, ["a", "c"]) + self.assertEqual(backbone.out_indices, [-3, -1]) From 5eeb5564846297aea01b39621ed7fc32ed458246 Mon Sep 17 00:00:00 2001 From: digger-yu Date: Thu, 4 May 2023 21:56:28 +0800 Subject: [PATCH 022/935] fix spelling error (#23143) change referrred to referred --- docs/source/en/preprocessing.mdx | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/docs/source/en/preprocessing.mdx b/docs/source/en/preprocessing.mdx index 9896b6898931..c2933099b03d 100644 --- a/docs/source/en/preprocessing.mdx +++ b/docs/source/en/preprocessing.mdx @@ -41,7 +41,7 @@ The main tool for preprocessing textual data is a [tokenizer](main_classes/token -If you plan on using a pretrained model, it's important to use the associated pretrained tokenizer. This ensures the text is split the same way as the pretraining corpus, and uses the same corresponding tokens-to-index (usually referrred to as the *vocab*) during pretraining. +If you plan on using a pretrained model, it's important to use the associated pretrained tokenizer. This ensures the text is split the same way as the pretraining corpus, and uses the same corresponding tokens-to-index (usually referred to as the *vocab*) during pretraining. From 3b74889e8f3109efb36a0002c8aaa3ede164e30e Mon Sep 17 00:00:00 2001 From: Victor Geislinger <9027783+MrGeislinger@users.noreply.github.com> Date: Thu, 4 May 2023 06:56:45 -0700 Subject: [PATCH 023/935] Remove typo in perf_train_gpu_many.mdx (#23144) - Excess `w` in the word `bottom` --- docs/source/en/perf_train_gpu_many.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/perf_train_gpu_many.mdx b/docs/source/en/perf_train_gpu_many.mdx index 17eb7b739925..e756732daf1a 100644 --- a/docs/source/en/perf_train_gpu_many.mdx +++ b/docs/source/en/perf_train_gpu_many.mdx @@ -272,7 +272,7 @@ It's easy to see from the bottom diagram how PP has less dead zones, where GPUs Both parts of the diagram show a parallelism that is of degree 4. That is 4 GPUs are participating in the pipeline. So there is the forward path of 4 pipe stages F0, F1, F2 and F3 and then the return reverse order backward path of B3, B2, B1 and B0. -PP introduces a new hyper-parameter to tune and it's `chunks` which defines how many chunks of data are sent in a sequence through the same pipe stage. For example, in the bottomw diagram you can see that `chunks=4`. GPU0 performs the same forward path on chunk 0, 1, 2 and 3 (F0,0, F0,1, F0,2, F0,3) and then it waits for other GPUs to do their work and only when their work is starting to be complete, GPU0 starts to work again doing the backward path for chunks 3, 2, 1 and 0 (B0,3, B0,2, B0,1, B0,0). +PP introduces a new hyper-parameter to tune and it's `chunks` which defines how many chunks of data are sent in a sequence through the same pipe stage. For example, in the bottom diagram you can see that `chunks=4`. GPU0 performs the same forward path on chunk 0, 1, 2 and 3 (F0,0, F0,1, F0,2, F0,3) and then it waits for other GPUs to do their work and only when their work is starting to be complete, GPU0 starts to work again doing the backward path for chunks 3, 2, 1 and 0 (B0,3, B0,2, B0,1, B0,0). Note that conceptually this is the same concept as gradient accumulation steps (GAS). Pytorch uses `chunks`, whereas DeepSpeed refers to the same hyper-parameter as GAS. 
From adb0760b5f008893a421109d72918ef2a47825ad Mon Sep 17 00:00:00 2001 From: Qingyang Wu Date: Thu, 4 May 2023 09:57:32 -0400 Subject: [PATCH 024/935] fix resume fsdp (#23111) * fix resume fsdp * fix rank 0 loading * fix style and quality --- src/transformers/trainer.py | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index d30218dc4721..d18e3efeb87a 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -2114,7 +2114,7 @@ def _load_from_checkpoint(self, resume_from_checkpoint, model=None): safe_weights_index_file = os.path.join(resume_from_checkpoint, SAFE_WEIGHTS_INDEX_NAME) if not any( - [os.path.isfile(f) for f in [weights_file, safe_weights_file, weights_index_file, safe_weights_index_file]] + os.path.isfile(f) for f in [weights_file, safe_weights_file, weights_index_file, safe_weights_index_file] ): raise ValueError(f"Can't find a valid checkpoint at {resume_from_checkpoint}") @@ -2364,6 +2364,12 @@ def _save_checkpoint(self, model, trial, metrics=None): if self.sharded_ddp == ShardedDDPOption.SIMPLE: self.optimizer.consolidate_state_dict() + if self.fsdp: + # FSDP has a different interface for saving optimizer states. + # Needs to be called on all ranks to gather all states. + # full_optim_state_dict will be deprecated after Pytorch 2.2! + full_osd = self.model.__class__.full_optim_state_dict(self.model, self.optimizer) + if is_torch_tpu_available(): xm.rendezvous("saving_optimizer_states") xm.save(self.optimizer.state_dict(), os.path.join(output_dir, OPTIMIZER_NAME)) @@ -2388,7 +2394,11 @@ def _save_checkpoint(self, model, trial, metrics=None): torch.save(self.scaler.state_dict(), os.path.join(output_dir, SCALER_NAME)) elif self.args.should_save and not self.deepspeed: # deepspeed.save_checkpoint above saves model/optim/sched - torch.save(self.optimizer.state_dict(), os.path.join(output_dir, OPTIMIZER_NAME)) + if self.fsdp: + torch.save(full_osd, os.path.join(output_dir, OPTIMIZER_NAME)) + else: + torch.save(self.optimizer.state_dict(), os.path.join(output_dir, OPTIMIZER_NAME)) + with warnings.catch_warnings(record=True) as caught_warnings: torch.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, SCHEDULER_NAME)) reissue_pt_warnings(caught_warnings) @@ -2498,9 +2508,18 @@ def opt_load_hook(mod, opt): # In distributed training however, we load directly on each GPU and risk the GPU OOM as it's more # likely to get OOM on CPU (since we load num_gpu times the optimizer state map_location = self.args.device if self.args.world_size > 1 else "cpu" - self.optimizer.load_state_dict( - torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location) - ) + if self.fsdp: + full_osd = None + # In FSDP, we need to load the full optimizer state dict on rank 0 and then shard it + if self.args.process_index == 0: + full_osd = torch.load(os.path.join(checkpoint, OPTIMIZER_NAME)) + # call scatter_full_optim_state_dict on all ranks + sharded_osd = self.model.__class__.scatter_full_optim_state_dict(full_osd, self.model) + self.optimizer.load_state_dict(sharded_osd) + else: + self.optimizer.load_state_dict( + torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location) + ) with warnings.catch_warnings(record=True) as caught_warnings: self.lr_scheduler.load_state_dict(torch.load(os.path.join(checkpoint, SCHEDULER_NAME))) reissue_pt_warnings(caught_warnings) From 510ad0a8b82cfe47cb368afcfcedf521ce0dbbed Mon Sep 17 00:00:00 2001 From: 
peter-sk Date: Thu, 4 May 2023 15:58:38 +0200 Subject: [PATCH 025/935] gpt2 multi-gpu fix (#23149) Co-authored-by: Prof. Peter Schneider-Kamp --- src/transformers/models/gpt2/modeling_gpt2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py index afc79463bd7b..b69b910a76d0 100644 --- a/src/transformers/models/gpt2/modeling_gpt2.py +++ b/src/transformers/models/gpt2/modeling_gpt2.py @@ -1670,9 +1670,9 @@ def forward( if start_positions is not None and end_positions is not None: # If we are on multi-GPU, split add a dimension if len(start_positions.size()) > 1: - start_positions = start_positions.squeeze(-1) + start_positions = start_positions.squeeze(-1).to(start_logits.device) if len(end_positions.size()) > 1: - end_positions = end_positions.squeeze(-1) + end_positions = end_positions.squeeze(-1).to(end_logits.device) # sometimes the start/end positions are outside our model inputs, we ignore these terms ignored_index = start_logits.size(1) start_positions = start_positions.clamp(0, ignored_index) From 83b38fbea8c2a6fbd9af0d9561db3afde2f4f4e2 Mon Sep 17 00:00:00 2001 From: peter-sk Date: Thu, 4 May 2023 16:15:15 +0200 Subject: [PATCH 026/935] GPTNeoXForQuestionAnswering (#23059) * first draft - gives index error in question_answering.py * maturing * no labels * pipeline should know about QA * fixing checks * formatting * fixed docstring * initial commit * formatting * adding the class to many places * towards less unhappy checks * nearly there * and gpt neox for qa * use right model * forgot this one * base_model_prefix is "gpt_neox" for GPTNeoX* models * unnecessary stuff * Update src/transformers/models/gpt_neox/modeling_gpt_neox.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * format * Update src/transformers/models/gpt_neox/modeling_gpt_neox.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * removed gpt2 stuff --------- Co-authored-by: Prof. Peter Schneider-Kamp Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- docs/source/en/model_doc/gpt_neox.mdx | 5 + docs/source/en/tasks/question_answering.mdx | 2 +- src/transformers/__init__.py | 2 + src/transformers/models/auto/modeling_auto.py | 1 + src/transformers/models/gpt2/modeling_gpt2.py | 3 +- src/transformers/models/gpt_neox/__init__.py | 2 + .../models/gpt_neox/modeling_gpt_neox.py | 101 ++++++++++++++++++ src/transformers/utils/dummy_pt_objects.py | 7 ++ .../models/gpt_neox/test_modeling_gpt_neox.py | 23 +++- 9 files changed, 142 insertions(+), 4 deletions(-) diff --git a/docs/source/en/model_doc/gpt_neox.mdx b/docs/source/en/model_doc/gpt_neox.mdx index 0c86a81a3ba1..bc8bbd67331f 100644 --- a/docs/source/en/model_doc/gpt_neox.mdx +++ b/docs/source/en/model_doc/gpt_neox.mdx @@ -79,6 +79,11 @@ The `generate()` method can be used to generate text using GPT Neo model. 
[[autodoc]] GPTNeoXForCausalLM - forward +## GPTNeoXForQuestionAnswering + +[[autodoc]] GPTNeoXForQuestionAnswering + - forward + ## GPTNeoXForSequenceClassification [[autodoc]] GPTNeoXForSequenceClassification diff --git a/docs/source/en/tasks/question_answering.mdx b/docs/source/en/tasks/question_answering.mdx index 3dd11140c253..fa39b640494d 100644 --- a/docs/source/en/tasks/question_answering.mdx +++ b/docs/source/en/tasks/question_answering.mdx @@ -31,7 +31,7 @@ The task illustrated in this tutorial is supported by the following model archit -[ALBERT](../model_doc/albert), [BART](../model_doc/bart), [BERT](../model_doc/bert), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CANINE](../model_doc/canine), [ConvBERT](../model_doc/convbert), [Data2VecText](../model_doc/data2vec-text), [DeBERTa](../model_doc/deberta), [DeBERTa-v2](../model_doc/deberta-v2), [DistilBERT](../model_doc/distilbert), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [ErnieM](../model_doc/ernie_m), [FlauBERT](../model_doc/flaubert), [FNet](../model_doc/fnet), [Funnel Transformer](../model_doc/funnel), [OpenAI GPT-2](../model_doc/gpt2), [GPT Neo](../model_doc/gpt_neo), [GPT-J](../model_doc/gptj), [I-BERT](../model_doc/ibert), [LayoutLMv2](../model_doc/layoutlmv2), [LayoutLMv3](../model_doc/layoutlmv3), [LED](../model_doc/led), [LiLT](../model_doc/lilt), [Longformer](../model_doc/longformer), [LUKE](../model_doc/luke), [LXMERT](../model_doc/lxmert), [MarkupLM](../model_doc/markuplm), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MobileBERT](../model_doc/mobilebert), [MPNet](../model_doc/mpnet), [MVP](../model_doc/mvp), [Nezha](../model_doc/nezha), [Nyströmformer](../model_doc/nystromformer), [OPT](../model_doc/opt), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [Splinter](../model_doc/splinter), [SqueezeBERT](../model_doc/squeezebert), [XLM](../model_doc/xlm), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod), [YOSO](../model_doc/yoso) +[ALBERT](../model_doc/albert), [BART](../model_doc/bart), [BERT](../model_doc/bert), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CANINE](../model_doc/canine), [ConvBERT](../model_doc/convbert), [Data2VecText](../model_doc/data2vec-text), [DeBERTa](../model_doc/deberta), [DeBERTa-v2](../model_doc/deberta-v2), [DistilBERT](../model_doc/distilbert), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [ErnieM](../model_doc/ernie_m), [FlauBERT](../model_doc/flaubert), [FNet](../model_doc/fnet), [Funnel Transformer](../model_doc/funnel), [OpenAI GPT-2](../model_doc/gpt2), [GPT Neo](../model_doc/gpt_neo), [GPT NeoX](../model_doc/gpt_neox), [GPT-J](../model_doc/gptj), [I-BERT](../model_doc/ibert), [LayoutLMv2](../model_doc/layoutlmv2), [LayoutLMv3](../model_doc/layoutlmv3), [LED](../model_doc/led), [LiLT](../model_doc/lilt), [Longformer](../model_doc/longformer), [LUKE](../model_doc/luke), [LXMERT](../model_doc/lxmert), [MarkupLM](../model_doc/markuplm), [mBART](../model_doc/mbart), 
[MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MobileBERT](../model_doc/mobilebert), [MPNet](../model_doc/mpnet), [MVP](../model_doc/mvp), [Nezha](../model_doc/nezha), [Nyströmformer](../model_doc/nystromformer), [OPT](../model_doc/opt), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [Splinter](../model_doc/splinter), [SqueezeBERT](../model_doc/squeezebert), [XLM](../model_doc/xlm), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod), [YOSO](../model_doc/yoso) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 1c74d242fc6b..7bf322ca8e1e 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1702,6 +1702,7 @@ [ "GPT_NEOX_PRETRAINED_MODEL_ARCHIVE_LIST", "GPTNeoXForCausalLM", + "GPTNeoXForQuestionAnswering", "GPTNeoXForSequenceClassification", "GPTNeoXForTokenClassification", "GPTNeoXLayer", @@ -5245,6 +5246,7 @@ from .models.gpt_neox import ( GPT_NEOX_PRETRAINED_MODEL_ARCHIVE_LIST, GPTNeoXForCausalLM, + GPTNeoXForQuestionAnswering, GPTNeoXForSequenceClassification, GPTNeoXForTokenClassification, GPTNeoXLayer, diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index a53128a61216..1ebc906d2d67 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -737,6 +737,7 @@ ("funnel", "FunnelForQuestionAnswering"), ("gpt2", "GPT2ForQuestionAnswering"), ("gpt_neo", "GPTNeoForQuestionAnswering"), + ("gpt_neox", "GPTNeoXForQuestionAnswering"), ("gptj", "GPTJForQuestionAnswering"), ("ibert", "IBertForQuestionAnswering"), ("layoutlmv2", "LayoutLMv2ForQuestionAnswering"), diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py index b69b910a76d0..e6ee04200b34 100644 --- a/src/transformers/models/gpt2/modeling_gpt2.py +++ b/src/transformers/models/gpt2/modeling_gpt2.py @@ -52,7 +52,6 @@ logger = logging.get_logger(__name__) _CHECKPOINT_FOR_DOC = "gpt2" -_REAL_CHECKPOINT_FOR_DOC = "gpt2" _CONFIG_FOR_DOC = "GPT2Config" GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [ @@ -1619,7 +1618,7 @@ def __init__(self, config): checkpoint=_CHECKPOINT_FOR_DOC, output_type=QuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC, - real_checkpoint=_REAL_CHECKPOINT_FOR_DOC, + real_checkpoint=_CHECKPOINT_FOR_DOC, ) def forward( self, diff --git a/src/transformers/models/gpt_neox/__init__.py b/src/transformers/models/gpt_neox/__init__.py index d5fd5cbda2b6..46f06b1991af 100644 --- a/src/transformers/models/gpt_neox/__init__.py +++ b/src/transformers/models/gpt_neox/__init__.py @@ -36,6 +36,7 @@ _import_structure["modeling_gpt_neox"] = [ "GPT_NEOX_PRETRAINED_MODEL_ARCHIVE_LIST", "GPTNeoXForCausalLM", + "GPTNeoXForQuestionAnswering", "GPTNeoXForSequenceClassification", "GPTNeoXForTokenClassification", "GPTNeoXLayer", @@ -64,6 +65,7 @@ from .modeling_gpt_neox import ( GPT_NEOX_PRETRAINED_MODEL_ARCHIVE_LIST, GPTNeoXForCausalLM, + GPTNeoXForQuestionAnswering, GPTNeoXForSequenceClassification, GPTNeoXForTokenClassification, GPTNeoXLayer, diff --git a/src/transformers/models/gpt_neox/modeling_gpt_neox.py b/src/transformers/models/gpt_neox/modeling_gpt_neox.py index 055fc9ee27cc..2058dbc1a56b 100755 
--- a/src/transformers/models/gpt_neox/modeling_gpt_neox.py +++ b/src/transformers/models/gpt_neox/modeling_gpt_neox.py @@ -31,6 +31,7 @@ from ...modeling_outputs import ( BaseModelOutputWithPast, CausalLMOutputWithPast, + QuestionAnsweringModelOutput, SequenceClassifierOutputWithPast, TokenClassifierOutput, ) @@ -955,3 +956,103 @@ def forward( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + + +@add_start_docstrings( + """ + The GPT-NeoX Model transformer with a span classification head on top for extractive question-answering tasks like + SQuAD (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + GPT_NEOX_START_DOCSTRING, +) +class GPTNeoXForQuestionAnswering(GPTNeoXPreTrainedModel): + _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"h\.\d+\.attn\.bias", r"lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.gpt_neox = GPTNeoXModel(config) + self.qa_outputs = nn.Linear(config.hidden_size, 2) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(GPT_NEOX_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=QuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + real_checkpoint=_REAL_CHECKPOINT_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + start_positions: Optional[torch.LongTensor] = None, + end_positions: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, QuestionAnsweringModelOutput]: + r""" + start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.gpt_neox( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1).to(start_logits.device) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1).to(end_logits.device) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 73827a3937a1..7279117698db 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -3336,6 +3336,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class GPTNeoXForQuestionAnswering(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class GPTNeoXForSequenceClassification(metaclass=DummyObject): _backends = ["torch"] diff --git a/tests/models/gpt_neox/test_modeling_gpt_neox.py b/tests/models/gpt_neox/test_modeling_gpt_neox.py index ff226684ccae..927d097691c5 100644 --- a/tests/models/gpt_neox/test_modeling_gpt_neox.py +++ b/tests/models/gpt_neox/test_modeling_gpt_neox.py @@ -31,6 +31,7 @@ from transformers import ( GPTNeoXForCausalLM, + GPTNeoXForQuestionAnswering, GPTNeoXForSequenceClassification, GPTNeoXForTokenClassification, GPTNeoXModel, @@ -149,6 +150,15 @@ def create_and_check_for_causal_lm(self, config, input_ids, input_mask, token_la result = model(input_ids, attention_mask=input_mask, labels=token_labels) self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + def create_and_check_for_question_answering(self, config, input_ids, input_mask, token_labels): + config.num_labels = self.num_labels + model = GPTNeoXForQuestionAnswering(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + def create_and_check_for_sequence_classification(self, config, 
input_ids, input_mask, token_labels): config.num_labels = self.num_labels model = GPTNeoXForSequenceClassification(config) @@ -213,7 +223,13 @@ def prepare_config_and_inputs_for_common(self): @require_torch class GPTNeoXModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): all_model_classes = ( - (GPTNeoXModel, GPTNeoXForCausalLM, GPTNeoXForSequenceClassification, GPTNeoXForTokenClassification) + ( + GPTNeoXModel, + GPTNeoXForCausalLM, + GPTNeoXForQuestionAnswering, + GPTNeoXForSequenceClassification, + GPTNeoXForTokenClassification, + ) if is_torch_available() else () ) @@ -221,6 +237,7 @@ class GPTNeoXModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi pipeline_model_mapping = ( { "feature-extraction": GPTNeoXModel, + "question-answering": GPTNeoXForQuestionAnswering, "text-classification": GPTNeoXForSequenceClassification, "token-classification": GPTNeoXForTokenClassification, "text-generation": GPTNeoXForCausalLM, @@ -265,6 +282,10 @@ def test_model_for_causal_lm(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_causal_lm(*config_and_inputs) + def test_model_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_question_answering(*config_and_inputs) + def test_model_for_sequence_classification(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) From 57ffd8ab4c833e26b2288769f6031f94870a102c Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Thu, 4 May 2023 16:31:19 +0200 Subject: [PATCH 027/935] [`GPT-J`] Fix causal mask dtype (#23147) * fix #23136 * better fix * same fix for `masked_bias` --- src/transformers/models/gptj/modeling_gptj.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/gptj/modeling_gptj.py b/src/transformers/models/gptj/modeling_gptj.py index 18985cb3bce4..3a1f99dd713a 100644 --- a/src/transformers/models/gptj/modeling_gptj.py +++ b/src/transformers/models/gptj/modeling_gptj.py @@ -89,8 +89,9 @@ def __init__(self, config): torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view( 1, 1, max_positions, max_positions ), + persistent=False, ) - self.register_buffer("masked_bias", torch.tensor(-1e9)) + self.register_buffer("masked_bias", torch.tensor(-1e9), persistent=False) self.attn_dropout = nn.Dropout(config.attn_pdrop) self.resid_dropout = nn.Dropout(config.resid_pdrop) @@ -732,7 +733,7 @@ def custom_forward(*inputs): GPTJ_START_DOCSTRING, ) class GPTJForCausalLM(GPTJPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"h\.\d+\.attn\.bias"] + _keys_to_ignore_on_load_unexpected = [r"h\.\d+\.attn\.masked_bias", r"h\.\d+\.attn\.bias"] def __init__(self, config): super().__init__(config) From 3341bb41cd2e3bf69e2682fbfe042b7a98b6d4fb Mon Sep 17 00:00:00 2001 From: Sylvain Gugger Date: Thu, 4 May 2023 12:00:22 -0400 Subject: [PATCH 028/935] Pin urllib3 --- setup.py | 3 ++- src/transformers/dependency_versions_table.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index abafc03bc6f2..d8f58360d7af 100644 --- a/setup.py +++ b/setup.py @@ -185,6 +185,7 @@ "tqdm>=4.27", "unidic>=1.0.2", "unidic_lite>=1.0.7", + "urllib3<2.0.0", "uvicorn", ] @@ -331,7 +332,7 @@ def run(self): 
extras["deepspeed-testing"] = extras["deepspeed"] + extras["testing"] + extras["optuna"] + extras["sentencepiece"] -extras["quality"] = deps_list("black", "datasets", "isort", "ruff", "GitPython", "hf-doc-builder") +extras["quality"] = deps_list("black", "datasets", "isort", "ruff", "GitPython", "hf-doc-builder", "urllib3") extras["all"] = ( extras["tf"] diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index bae19acd3e1d..dd1055ebd2e4 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -85,5 +85,6 @@ "tqdm": "tqdm>=4.27", "unidic": "unidic>=1.0.2", "unidic_lite": "unidic_lite>=1.0.7", + "urllib3": "urllib3<2.0.0", "uvicorn": "uvicorn", } From c8f2c5c56e942e8c45821d07555f2eab178b3f83 Mon Sep 17 00:00:00 2001 From: raghavanone <115454562+raghavanone@users.noreply.github.com> Date: Thu, 4 May 2023 22:30:16 +0530 Subject: [PATCH 029/935] Add FlaxWhisperForAudioClassification model (#22883) * Add FlaxWhisperForAudioClassification model * Add models to init * Add models to init * Fix copies * Fix automapping --- docs/source/en/model_doc/whisper.mdx | 6 + src/transformers/__init__.py | 8 +- .../models/auto/modeling_flax_auto.py | 5 + src/transformers/models/whisper/__init__.py | 2 + .../models/whisper/modeling_flax_whisper.py | 160 ++++++++++++++ src/transformers/utils/dummy_flax_objects.py | 7 + .../whisper/test_modeling_flax_whisper.py | 205 +++++++++++++++++- 7 files changed, 390 insertions(+), 3 deletions(-) diff --git a/docs/source/en/model_doc/whisper.mdx b/docs/source/en/model_doc/whisper.mdx index 22b08e4e61bc..52a8b5953c63 100644 --- a/docs/source/en/model_doc/whisper.mdx +++ b/docs/source/en/model_doc/whisper.mdx @@ -105,3 +105,9 @@ The original code can be found [here](https://github.com/openai/whisper). 
[[autodoc]] FlaxWhisperForConditionalGeneration - __call__ + +## FlaxWhisperForAudioClassification + +[[autodoc]] FlaxWhisperForAudioClassification + - __call__ + diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 7bf322ca8e1e..b0766b0946cd 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -3779,6 +3779,7 @@ "FlaxWhisperForConditionalGeneration", "FlaxWhisperModel", "FlaxWhisperPreTrainedModel", + "FlaxWhisperForAudioClassification", ] ) _import_structure["models.xglm"].extend( @@ -6903,7 +6904,12 @@ FlaxWav2Vec2Model, FlaxWav2Vec2PreTrainedModel, ) - from .models.whisper import FlaxWhisperForConditionalGeneration, FlaxWhisperModel, FlaxWhisperPreTrainedModel + from .models.whisper import ( + FlaxWhisperForAudioClassification, + FlaxWhisperForConditionalGeneration, + FlaxWhisperModel, + FlaxWhisperPreTrainedModel, + ) from .models.xglm import FlaxXGLMForCausalLM, FlaxXGLMModel, FlaxXGLMPreTrainedModel from .models.xlm_roberta import ( FLAX_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST, diff --git a/src/transformers/models/auto/modeling_flax_auto.py b/src/transformers/models/auto/modeling_flax_auto.py index 755d1f07a344..e3b8d9cf5b52 100644 --- a/src/transformers/models/auto/modeling_flax_auto.py +++ b/src/transformers/models/auto/modeling_flax_auto.py @@ -229,6 +229,11 @@ ] ) +FLAX_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES = OrderedDict( + [ + ("whisper", "FlaxWhisperForAudioClassification"), + ] +) FLAX_MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, FLAX_MODEL_MAPPING_NAMES) FLAX_MODEL_FOR_PRETRAINING_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_PRETRAINING_MAPPING_NAMES) diff --git a/src/transformers/models/whisper/__init__.py b/src/transformers/models/whisper/__init__.py index 3b6015a56f6f..cd962478e34d 100644 --- a/src/transformers/models/whisper/__init__.py +++ b/src/transformers/models/whisper/__init__.py @@ -75,6 +75,7 @@ "FlaxWhisperForConditionalGeneration", "FlaxWhisperModel", "FlaxWhisperPreTrainedModel", + "FlaxWhisperForAudioClassification", ] @@ -126,6 +127,7 @@ pass else: from .modeling_flax_whisper import ( + FlaxWhisperForAudioClassification, FlaxWhisperForConditionalGeneration, FlaxWhisperModel, FlaxWhisperPreTrainedModel, diff --git a/src/transformers/models/whisper/modeling_flax_whisper.py b/src/transformers/models/whisper/modeling_flax_whisper.py index b8d6f07242d8..e36131680d63 100644 --- a/src/transformers/models/whisper/modeling_flax_whisper.py +++ b/src/transformers/models/whisper/modeling_flax_whisper.py @@ -36,6 +36,7 @@ FlaxCausalLMOutputWithCrossAttentions, FlaxSeq2SeqLMOutput, FlaxSeq2SeqModelOutput, + FlaxSequenceClassifierOutput, ) from ...modeling_flax_utils import ( ACT2FN, @@ -1506,3 +1507,162 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs): append_replace_return_docstrings( FlaxWhisperForConditionalGeneration, output_type=FlaxSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC ) + + +class FlaxWhisperForAudioClassificationModule(nn.Module): + config: WhisperConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self) -> None: + self.encoder = FlaxWhisperEncoder(config=self.config, dtype=self.dtype) + self.config.is_encoder_decoder = False + num_layers = self.config.num_hidden_layers + 1 + if self.config.use_weighted_layer_sum: + self.layer_weights = jnp.repeat(1 / num_layers, num_layers) + self.projector = nn.Dense(self.config.classifier_proj_size, dtype=self.dtype) + self.classifier = nn.Dense(self.config.num_labels, dtype=self.dtype) + + def __call__( + 
self, + input_features, + encoder_outputs=None, + output_attentions=None, + output_hidden_states: bool = True, + return_dict: bool = True, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if encoder_outputs is None: + encoder_outputs = self.encoder( + input_features, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if self.config.use_weighted_layer_sum: + hidden_states = jnp.stack(encoder_outputs, axis=1) + norm_weights = jax.nn.softmax(self.layer_weights, axis=-1) + hidden_states = jnp.sum(hidden_states * jnp.reshape(norm_weights, [-1, 1, 1]), axis=1) + else: + hidden_states = encoder_outputs[0] + + hidden_states = self.projector(hidden_states) + pooled_output = jnp.mean(hidden_states, axis=1) + + logits = self.classifier(pooled_output) + + if not return_dict: + return (logits,) + encoder_outputs[1:] + + return FlaxSequenceClassifierOutput( + logits=logits, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +@add_start_docstrings("The Whisper Model with an audio classification head on top.", WHISPER_START_DOCSTRING) +class FlaxWhisperForAudioClassification(FlaxWhisperPreTrainedModel): + module_class = FlaxWhisperForAudioClassificationModule + dtype: jnp.dtype = jnp.float32 + + def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict: + # init input tensors + input_features = jnp.zeros(input_shape, dtype="f4") + input_features = input_features.at[(..., -1)].set(self.config.eos_token_id) + + params_rng, dropout_rng = jax.random.split(rng) + rngs = {"params": params_rng, "dropout": dropout_rng} + + random_params = self.module.init( + rngs, + input_features=input_features, + )["params"] + + if params is not None: + random_params = flatten_dict(unfreeze(random_params)) + params = flatten_dict(unfreeze(params)) + for missing_key in self._missing_keys: + params[missing_key] = random_params[missing_key] + self._missing_keys = set() + return freeze(unflatten_dict(params)) + else: + return random_params + + @add_start_docstrings_to_model_forward(WHISPER_INPUTS_DOCSTRING) + def __call__( + self, + input_features: jnp.ndarray, + attention_mask: Optional[jnp.ndarray] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + train: bool = False, + params: dict = None, + dropout_rng: PRNGKey = None, + **kwargs, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + # Handle any PRNG if needed + rngs = {} + if dropout_rng is not None: + rngs["dropout"] = dropout_rng + + return self.module.apply( + {"params": params or self.params}, + input_features=jnp.array(input_features, dtype="f4"), + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + rngs=rngs, + ) + + +FLAX_WHISPER_AUDIO_CLASSIFICATION_DOCSTRING = r""" + Returns: + + Transcription example: + + 
```python + >>> import jax.numpy as jnp + >>> from transformers import AutoFeatureExtractor, FlaxWhisperForAudioClassification + >>> from datasets import load_dataset + + >>> feature_extractor = AutoFeatureExtractor.from_pretrained("sanchit-gandhi/whisper-medium-fleurs-lang-id") + >>> model = FlaxWhisperForAudioClassification.from_pretrained( + ... "sanchit-gandhi/whisper-medium-fleurs-lang-id", from_pt=True + ... ) + >>> ds = load_dataset("google/fleurs", "all", split="validation", streaming=True) + + >>> sample = next(iter(ds)) + + >>> inputs = feature_extractor( + ... sample["audio"]["array"], sampling_rate=sample["audio"]["sampling_rate"], return_tensors="np" + ... ) + >>> input_features = inputs.input_features + + >>> logits = model(input_features).logits + + >>> predicted_class_ids = jnp.argmax(logits).item() + >>> predicted_label = model.config.id2label[predicted_class_ids] + >>> predicted_label + 'af_za' + ``` +""" + +overwrite_call_docstring( + FlaxWhisperForAudioClassification, WHISPER_INPUTS_DOCSTRING + FLAX_WHISPER_AUDIO_CLASSIFICATION_DOCSTRING +) +append_replace_return_docstrings( + FlaxWhisperForAudioClassification, output_type=FlaxSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC +) diff --git a/src/transformers/utils/dummy_flax_objects.py b/src/transformers/utils/dummy_flax_objects.py index eeec3277492d..ce571bc9f8d0 100644 --- a/src/transformers/utils/dummy_flax_objects.py +++ b/src/transformers/utils/dummy_flax_objects.py @@ -1182,6 +1182,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["flax"]) +class FlaxWhisperForAudioClassification(metaclass=DummyObject): + _backends = ["flax"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["flax"]) + + class FlaxWhisperForConditionalGeneration(metaclass=DummyObject): _backends = ["flax"] diff --git a/tests/models/whisper/test_modeling_flax_whisper.py b/tests/models/whisper/test_modeling_flax_whisper.py index 3f1e201d72d8..79a2c51039ac 100644 --- a/tests/models/whisper/test_modeling_flax_whisper.py +++ b/tests/models/whisper/test_modeling_flax_whisper.py @@ -12,8 +12,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- - import functools import inspect import tempfile @@ -41,6 +39,7 @@ from transformers import ( FLAX_MODEL_MAPPING, + FlaxWhisperForAudioClassification, FlaxWhisperForConditionalGeneration, FlaxWhisperModel, WhisperFeatureExtractor, @@ -704,3 +703,205 @@ def test_tiny_timestamp_generation(self): transcript = processor.batch_decode(generated_ids, skip_special_tokens=True, output_offsets=True) self.assertEqual(transcript, EXPECTED_TRANSCRIPT) + + +class FlaxWhisperEncoderModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=60, + is_training=True, + use_labels=True, + hidden_size=16, + num_hidden_layers=2, + num_attention_heads=4, + input_channels=1, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=20, + max_source_positions=30, + num_mel_bins=80, + num_conv_layers=1, + suppress_tokens=None, + begin_suppress_tokens=None, + classifier_proj_size=4, + num_labels=2, + is_encoder_decoder=False, + is_decoder=False, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_labels = use_labels + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.input_channels = input_channels + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.num_mel_bins = num_mel_bins + self.max_position_embeddings = max_position_embeddings + self.max_source_positions = max_source_positions + self.num_conv_layers = num_conv_layers + self.suppress_tokens = suppress_tokens + self.begin_suppress_tokens = begin_suppress_tokens + self.classifier_proj_size = classifier_proj_size + self.num_labels = num_labels + self.is_encoder_decoder = is_encoder_decoder + self.is_decoder = is_decoder + + def get_config(self): + return WhisperConfig( + d_model=self.hidden_size, + encoder_layers=self.num_hidden_layers, + decoder_layers=self.num_hidden_layers, + encoder_attention_heads=self.num_attention_heads, + decoder_attention_heads=self.num_attention_heads, + input_channels=self.input_channels, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + max_source_positions=self.max_source_positions, + decoder_ffn_dim=self.hidden_size, + encoder_ffn_dim=self.hidden_size, + suppress_tokens=self.suppress_tokens, + begin_suppress_tokens=self.begin_suppress_tokens, + classifier_proj_size=self.classifier_proj_size, + num_labels=self.num_labels, + is_encoder_decoder=self.is_encoder_decoder, + is_decoder=self.is_decoder, + ) + + def prepare_whisper_encoder_inputs_dict( + self, + input_features, + ): + return { + "input_features": input_features, + } + + def prepare_config_and_inputs(self): + input_features = floats_tensor([self.batch_size, self.num_mel_bins, self.seq_length]) + + config = self.get_config() + inputs_dict = self.prepare_whisper_encoder_inputs_dict( + input_features=input_features, + ) + return config, inputs_dict + + def prepare_config_and_inputs_for_common(self): + config, inputs_dict = self.prepare_config_and_inputs() + return config, inputs_dict + + def get_subsampled_output_lengths(self, input_lengths): + """ + Computes the output length of the convolutional layers + """ + + for i in range(self.num_conv_layers): + input_lengths = (input_lengths - 1) // 2 + 1 + + return input_lengths + + @property + def 
encoder_seq_length(self): + return self.get_subsampled_output_lengths(self.seq_length) + + +@require_flax +class WhisperEncoderModelTest(FlaxModelTesterMixin, unittest.TestCase): + all_model_classes = (FlaxWhisperForAudioClassification,) if is_flax_available() else () + is_encoder_decoder = False + fx_compatible = False + test_pruning = False + test_missing_keys = False + + input_name = "input_features" + + def setUp(self): + self.model_tester = FlaxWhisperEncoderModelTester(self) + _, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + self.init_shape = (1,) + inputs_dict["input_features"].shape[1:] + + self.all_model_classes = ( + make_partial_class(model_class, input_shape=self.init_shape) for model_class in self.all_model_classes + ) + self.config_tester = ConfigTester(self, config_class=WhisperConfig) + + def test_config(self): + self.config_tester.run_common_tests() + + # overwrite because of `input_features` + def test_jit_compilation(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + with self.subTest(model_class.__name__): + prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class) + model = model_class(config) + + @jax.jit + def model_jitted(input_features, **kwargs): + return model(input_features=input_features, **kwargs) + + with self.subTest("JIT Enabled"): + jitted_outputs = model_jitted(**prepared_inputs_dict).to_tuple() + + with self.subTest("JIT Disabled"): + with jax.disable_jit(): + outputs = model_jitted(**prepared_inputs_dict).to_tuple() + + self.assertEqual(len(outputs), len(jitted_outputs)) + for jitted_output, output in zip(jitted_outputs, outputs): + self.assertEqual(jitted_output.shape, output.shape) + + # overwrite because of `input_features` + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.__call__) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["input_features", "attention_mask", "output_attentions"] + self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) + + def test_inputs_embeds(self): + pass + + # WhisperEncoder has no inputs_embeds and thus the `get_input_embeddings` fn is not implemented + def test_model_common_attributes(self): + pass + + # WhisperEncoder cannot resize token embeddings since it has no tokens embeddings + def test_resize_tokens_embeddings(self): + pass + + # WhisperEncoder does not have any base model + def test_save_load_to_base(self): + pass + + # WhisperEncoder does not have any base model + def test_save_load_from_base(self): + pass + + # WhisperEncoder does not have any base model + @is_pt_flax_cross_test + def test_save_load_from_base_pt(self): + pass + + # WhisperEncoder does not have any base model + @is_pt_flax_cross_test + def test_save_load_to_base_pt(self): + pass + + # WhisperEncoder does not have any base model + @is_pt_flax_cross_test + def test_save_load_bf16_to_base_pt(self): + pass From 516dc6305f2abca3f48218310ea872e3b1c9c995 Mon Sep 17 00:00:00 2001 From: Maria Khalusova Date: Thu, 4 May 2023 13:17:13 -0400 Subject: [PATCH 030/935] [docs] Text to speech task guide (#23107) * First draft * Some polishing * Text polishing * added TOC entry for TTS * make style * added links to images * fixed links to 
images * Apply suggestions from code review Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * feedback addressed * feedback from Matthijs addresed * Update docs/source/en/tasks/text-to-speech.mdx Co-authored-by: Matthijs Hollemans --------- Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Co-authored-by: Matthijs Hollemans --- docs/source/en/_toctree.yml | 2 + docs/source/en/tasks/text-to-speech.mdx | 558 ++++++++++++++++++++++++ 2 files changed, 560 insertions(+) create mode 100644 docs/source/en/tasks/text-to-speech.mdx diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index b9e346b3000c..f6e9684f79e3 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -71,6 +71,8 @@ title: Image captioning - local: tasks/document_question_answering title: Document Question Answering + - local: tasks/text-to-speech + title: Text to speech title: Multimodal isExpanded: false title: Task Guides diff --git a/docs/source/en/tasks/text-to-speech.mdx b/docs/source/en/tasks/text-to-speech.mdx new file mode 100644 index 000000000000..a368fcb35f7e --- /dev/null +++ b/docs/source/en/tasks/text-to-speech.mdx @@ -0,0 +1,558 @@ + + +# Text to speech + +[[open-in-colab]] + +Text-to-speech (TTS) is the task of creating natural-sounding speech from text, where the speech can be generated in multiple +languages and for multiple speakers. The only text-to-speech model currently available in 🤗 Transformers +is [SpeechT5](model_doc/speecht5), though more will be added in the future. SpeechT5 is pre-trained on a combination of +speech-to-text and text-to-speech data, allowing it to learn a unified space of hidden representations shared by both text +and speech. This means that the same pre-trained model can be fine-tuned for different tasks. Furthermore, SpeechT5 +supports multiple speakers through x-vector speaker embeddings. + +This guide illustrates how to: + +1. Fine-tune [SpeechT5](model_doc/speecht5) that was originally trained on English speech on the Dutch (`nl`) language subset of the [VoxPopuli](https://huggingface.co/datasets/facebook/voxpopuli) dataset. +2. Use your fine-tuned model for inference. + +Before you begin, make sure you have all the necessary libraries installed: + +```bash +pip install datasets soundfile speechbrain accelerate +``` + +Install 🤗Transformers from source as not all the SpeechT5 features have been merged into an official release yet: + +```bash +pip install git+https://github.com/huggingface/transformers.git +``` + + + +To follow this guide you will need a GPU. If you're working in a notebook, run the following line to check if a GPU is available: + +```bash +!nvidia-smi +``` + + + +We encourage you to log in to your Hugging Face account to upload and share your model with the community. When prompted, enter your token to log in: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## Load the dataset + +[VoxPopuli](https://huggingface.co/datasets/facebook/voxpopuli) is a large-scale multilingual speech corpus consisting of +data sourced from 2009-2020 European Parliament event recordings. It contains labelled audio-transcription data for 15 +European languages. In this guide, we are using the Dutch language subset, feel free to pick another subset. + +Note that VoxPopuli or any other automated speech recognition (ASR) dataset may not be the most suitable +option for training TTS models. 
The features that make it beneficial for ASR, such as excessive background noise, are +typically undesirable in TTS. However, finding top-quality, multilingual, and multi-speaker TTS datasets can be quite +challenging. + +Let's load the data: + +```py +>>> from datasets import load_dataset, Audio + +>>> dataset = load_dataset("facebook/voxpopuli", "nl", split="train") +>>> len(dataset) +20968 +``` + +20968 examples should be sufficient for fine-tuning. SpeechT5 expects audio data to have a sampling rate of 16 kHz, so +make sure the examples in the dataset meet this requirement: + +```py +dataset = dataset.cast_column("audio", Audio(sampling_rate=16000)) +``` + +## Preprocess the data + +Let's begin by defining the model checkpoint to use and loading the appropriate processor: + +```py +>>> from transformers import SpeechT5Processor + +>>> checkpoint = "microsoft/speecht5_tts" +>>> processor = SpeechT5Processor.from_pretrained(checkpoint) +``` + +### Text cleanup for SpeechT5 tokenization + +Start by cleaning up the text data. You'll need the tokenizer part of the processor to process the text: + +```py +>>> tokenizer = processor.tokenizer +``` + +The dataset examples contain `raw_text` and `normalized_text` features. When deciding which feature to use as the text input, +consider that the SpeechT5 tokenizer doesn't have any tokens for numbers. In `normalized_text` the numbers are written +out as text. Thus, it is a better fit, and we recommend using `normalized_text` as input text. + +Because SpeechT5 was trained on the English language, it may not recognize certain characters in the Dutch dataset. If +left as is, these characters will be converted to `` tokens. However, in Dutch, certain characters like `à` are +used to stress syllables. In order to preserve the meaning of the text, we can replace this character with a regular `a`. + +To identify unsupported tokens, extract all unique characters in the dataset using the `SpeechT5Tokenizer` which +works with characters as tokens. To do this, write the `extract_all_chars` mapping function that concatenates +the transcriptions from all examples into one string and converts it to a set of characters. +Make sure to set `batched=True` and `batch_size=-1` in `dataset.map()` so that all transcriptions are available at once for +the mapping function. + +```py +>>> def extract_all_chars(batch): +... all_text = " ".join(batch["normalized_text"]) +... vocab = list(set(all_text)) +... return {"vocab": [vocab], "all_text": [all_text]} + + +>>> vocabs = dataset.map( +... extract_all_chars, +... batched=True, +... batch_size=-1, +... keep_in_memory=True, +... remove_columns=dataset.column_names, +... ) + +>>> dataset_vocab = set(vocabs["vocab"][0]) +>>> tokenizer_vocab = {k for k, _ in tokenizer.get_vocab().items()} +``` + +Now you have two sets of characters: one with the vocabulary from the dataset and one with the vocabulary from the tokenizer. +To identify any unsupported characters in the dataset, you can take the difference between these two sets. The resulting +set will contain the characters that are in the dataset but not in the tokenizer. + +```py +>>> dataset_vocab - tokenizer_vocab +{' ', 'à', 'ç', 'è', 'ë', 'í', 'ï', 'ö', 'ü'} +``` + +To handle the unsupported characters identified in the previous step, define a function that maps these characters to +valid tokens. Note that spaces are already replaced by `▁` in the tokenizer and don't need to be handled separately. + +```py +>>> replacements = [ +... ("à", "a"), +... ("ç", "c"), +... 
("è", "e"), +... ("ë", "e"), +... ("í", "i"), +... ("ï", "i"), +... ("ö", "o"), +... ("ü", "u"), +... ] + + +>>> def cleanup_text(inputs): +... for src, dst in replacements: +... inputs["normalized_text"] = inputs["normalized_text"].replace(src, dst) +... return inputs + + +>>> dataset = dataset.map(cleanup_text) +``` + +Now that you have dealt with special characters in the text, it's time to shift focus to the audio data. + +### Speakers + +The VoxPopuli dataset includes speech from multiple speakers, but how many speakers are represented in the dataset? To +determine this, we can count the number of unique speakers and the number of examples each speaker contributes to the dataset. +With a total of 20,968 examples in the dataset, this information will give us a better understanding of the distribution of +speakers and examples in the data. + +```py +>>> from collections import defaultdict + +>>> speaker_counts = defaultdict(int) + +>>> for speaker_id in dataset["speaker_id"]: +... speaker_counts[speaker_id] += 1 +``` + +By plotting a histogram you can get a sense of how much data there is for each speaker. + +```py +>>> import matplotlib.pyplot as plt + +>>> plt.figure() +>>> plt.hist(speaker_counts.values(), bins=20) +>>> plt.ylabel("Speakers") +>>> plt.xlabel("Examples") +>>> plt.show() +``` + +
+ [figure: Speakers histogram]
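The distribution can also be summarised numerically from the `speaker_counts` dictionary built above. This is a quick sketch (it assumes NumPy is available in your environment) that backs up the observations in the next paragraph:

```py
>>> import numpy as np

>>> counts = np.array(list(speaker_counts.values()))
>>> share_under_100 = float((counts < 100).mean())  # fraction of speakers with fewer than 100 examples
>>> speakers_over_500 = int((counts > 500).sum())  # number of speakers with more than 500 examples
```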
+ +The histogram reveals that approximately one-third of the speakers in the dataset have fewer than 100 examples, while +around ten speakers have more than 500 examples. To improve training efficiency and balance the dataset, we can limit +the data to speakers with between 100 and 400 examples. + +```py +>>> def select_speaker(speaker_id): +... return 100 <= speaker_counts[speaker_id] <= 400 + + +>>> dataset = dataset.filter(select_speaker, input_columns=["speaker_id"]) +``` + +Let's check how many speakers remain: + +```py +>>> len(set(dataset["speaker_id"])) +42 +``` + +Let's see how many examples are left: + +```py +>>> len(dataset) +9973 +``` + +You are left with just under 10,000 examples from approximately 40 unique speakers, which should be sufficient. + +Note that some speakers with few examples may actually have more audio available if the examples are long. However, +determining the total amount of audio for each speaker requires scanning through the entire dataset, which is a +time-consuming process that involves loading and decoding each audio file. As such, we have chosen to skip this step here. + +### Speaker embeddings + +To enable the TTS model to differentiate between multiple speakers, you'll need to create a speaker embedding for each example. +The speaker embedding is an additional input into the model that captures a particular speaker's voice characteristics. +To generate these speaker embeddings, use the pre-trained [spkrec-xvect-voxceleb](https://huggingface.co/speechbrain/spkrec-xvect-voxceleb) +model from SpeechBrain. + +Create a function `create_speaker_embedding()` that takes an input audio waveform and outputs a 512-element vector +containing the corresponding speaker embedding. + +```py +>>> import os +>>> import torch +>>> from speechbrain.pretrained import EncoderClassifier + +>>> spk_model_name = "speechbrain/spkrec-xvect-voxceleb" + +>>> device = "cuda" if torch.cuda.is_available() else "cpu" +>>> speaker_model = EncoderClassifier.from_hparams( +... source=spk_model_name, +... run_opts={"device": device}, +... savedir=os.path.join("/tmp", spk_model_name), +... ) + + +>>> def create_speaker_embedding(waveform): +... with torch.no_grad(): +... speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform)) +... speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2) +... speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy() +... return speaker_embeddings +``` + +It's important to note that the `speechbrain/spkrec-xvect-voxceleb` model was trained on English speech from the VoxCeleb +dataset, whereas the training examples in this guide are in Dutch. While we believe that this model will still generate +reasonable speaker embeddings for our Dutch dataset, this assumption may not hold true in all cases. + +For optimal results, we recommend training an X-vector model on the target speech first. This will ensure that the model +is better able to capture the unique voice characteristics present in the Dutch language. + +### Processing the dataset + +Finally, let's process the data into the format the model expects. Create a `prepare_dataset` function that takes in a +single example and uses the `SpeechT5Processor` object to tokenize the input text and load the target audio into a log-mel spectrogram. +It should also add the speaker embeddings as an additional input. + +```py +>>> def prepare_dataset(example): +... audio = example["audio"] + +... example = processor( +... text=example["normalized_text"], +... 
audio_target=audio["array"], +... sampling_rate=audio["sampling_rate"], +... return_attention_mask=False, +... ) + +... # strip off the batch dimension +... example["labels"] = example["labels"][0] + +... # use SpeechBrain to obtain x-vector +... example["speaker_embeddings"] = create_speaker_embedding(audio["array"]) + +... return example +``` + +Verify the processing is correct by looking at a single example: + +```py +>>> processed_example = prepare_dataset(dataset[0]) +>>> list(processed_example.keys()) +['input_ids', 'labels', 'stop_labels', 'speaker_embeddings'] +``` + +Speaker embeddings should be a 512-element vector: + +```py +>>> processed_example["speaker_embeddings"].shape +(512,) +``` + +The labels should be a log-mel spectrogram with 80 mel bins. + +```py +>>> import matplotlib.pyplot as plt + +>>> plt.figure() +>>> plt.imshow(processed_example["labels"].T) +>>> plt.show() +``` + +
+ [figure: Log-mel spectrogram with 80 mel bins]
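If you prefer the conventional orientation with low frequencies at the bottom, matplotlib can flip the vertical axis for you. This is a small variation on the plotting snippet above, reusing the same `processed_example`:

```py
>>> plt.figure()
>>> plt.imshow(processed_example["labels"].T, origin="lower", aspect="auto")
>>> plt.show()
```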
+ +Side note: If you find this spectrogram confusing, it may be due to your familiarity with the convention of placing low frequencies +at the bottom and high frequencies at the top of a plot. However, when plotting spectrograms as an image using the matplotlib library, +the y-axis is flipped and the spectrograms appear upside down. + +Now apply the processing function to the entire dataset. This will take between 5 and 10 minutes. + +```py +>>> dataset = dataset.map(prepare_dataset, remove_columns=dataset.column_names) +``` + +You'll see a warning saying that some examples in the dataset are longer than the maximum input length the model can handle (600 tokens). +Remove those examples from the dataset. Here we go even further and to allow for larger batch sizes we remove anything over 200 tokens. + +```py +>>> def is_not_too_long(input_ids): +... input_length = len(input_ids) +... return input_length < 200 + + +>>> dataset = dataset.filter(is_not_too_long, input_columns=["input_ids"]) +>>> len(dataset) +8259 +``` + +Next, create a basic train/test split: + +```py +>>> dataset = dataset.train_test_split(test_size=0.1) +``` + +### Data collator + +In order to combine multiple examples into a batch, you need to define a custom data collator. This collator will pad shorter sequences with padding +tokens, ensuring that all examples have the same length. For the spectrogram labels, the padded portions are replaced with the special value `-100`. This special value +instructs the model to ignore that part of the spectrogram when calculating the spectrogram loss. + +```py +>>> from dataclasses import dataclass +>>> from typing import Any, Dict, List, Union + + +>>> @dataclass +... class TTSDataCollatorWithPadding: +... processor: Any + +... def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]: +... input_ids = [{"input_ids": feature["input_ids"]} for feature in features] +... label_features = [{"input_values": feature["labels"]} for feature in features] +... speaker_features = [feature["speaker_embeddings"] for feature in features] + +... # collate the inputs and targets into a batch +... batch = processor.pad(input_ids=input_ids, labels=label_features, return_tensors="pt") + +... # replace padding with -100 to ignore loss correctly +... batch["labels"] = batch["labels"].masked_fill(batch.decoder_attention_mask.unsqueeze(-1).ne(1), -100) + +... # not used during fine-tuning +... del batch["decoder_attention_mask"] + +... # round down target lengths to multiple of reduction factor +... if model.config.reduction_factor > 1: +... target_lengths = torch.tensor([len(feature["input_values"]) for feature in label_features]) +... target_lengths = target_lengths.new( +... [length - length % model.config.reduction_factor for length in target_lengths] +... ) +... max_length = max(target_lengths) +... batch["labels"] = batch["labels"][:, :max_length] + +... # also add in the speaker embeddings +... batch["speaker_embeddings"] = torch.tensor(speaker_features) + +... return batch +``` + +In SpeechT5, the input to the decoder part of the model is reduced by a factor 2. In other words, it throws away every +other timestep from the target sequence. The decoder then predicts a sequence that is twice as long. Since the original +target sequence length may be odd, the data collator makes sure to round the maximum length of the batch down to be a +multiple of 2. 
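To make the rounding concrete, here is a small self-contained sketch of the same arithmetic the collator applies to the target lengths (the lengths themselves are made up for illustration):

```py
>>> import torch

>>> reduction_factor = 2  # SpeechT5 reduces the decoder inputs by this factor
>>> target_lengths = torch.tensor([37, 24, 31])  # hypothetical spectrogram lengths in a batch
>>> target_lengths - target_lengths % reduction_factor
tensor([36, 24, 30])
```

Only the maximum of these rounded lengths is used, and the batch labels are truncated to it.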
+ +```py +>>> data_collator = TTSDataCollatorWithPadding(processor=processor) +``` + +## Train the model + +Load the pre-trained model from the same checkpoint as you used for loading the processor: + +```py +>>> from transformers import SpeechT5ForTextToSpeech + +>>> model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint) +``` + +The `use_cache=True` option is incompatible with gradient checkpointing. Disable it for training. + +```py +>>> model.config.use_cache = False +``` + +Define the training arguments. Here we are not computing any evaluation metrics during the training process. Instead, we'll +only look at the loss: + +```python +>>> from transformers import Seq2SeqTrainingArguments + +>>> training_args = Seq2SeqTrainingArguments( +... output_dir="speecht5_finetuned_voxpopuli_nl", # change to a repo name of your choice +... per_device_train_batch_size=4, +... gradient_accumulation_steps=8, +... learning_rate=1e-5, +... warmup_steps=500, +... max_steps=4000, +... gradient_checkpointing=True, +... fp16=True, +... evaluation_strategy="steps", +... per_device_eval_batch_size=2, +... save_steps=1000, +... eval_steps=1000, +... logging_steps=25, +... report_to=["tensorboard"], +... load_best_model_at_end=True, +... greater_is_better=False, +... label_names=["labels"], +... push_to_hub=True, +... ) +``` + +Instantiate the `Trainer` object and pass the model, dataset, and data collator to it. + +```py +>>> from transformers import Seq2SeqTrainer + +>>> trainer = Seq2SeqTrainer( +... args=training_args, +... model=model, +... train_dataset=dataset["train"], +... eval_dataset=dataset["test"], +... data_collator=data_collator, +... tokenizer=processor.tokenizer, +... ) +``` + +And with that, you're ready to start training! Training will take several hours. Depending on your GPU, +it is possible that you will encounter a CUDA "out-of-memory" error when you start training. In this case, you can reduce +the `per_device_train_batch_size` incrementally by factors of 2 and increase `gradient_accumulation_steps` by 2x to compensate. + +```py +>>> trainer.train() +``` + +Push the final model to the 🤗 Hub: + +```py +>>> trainer.push_to_hub() +``` + +## Inference + +Great, now that you've fine-tuned a model, you can use it for inference! +Load the model from the 🤗 Hub (make sure to use your account name in the following code snippet): + +```py +>>> model = SpeechT5ForTextToSpeech.from_pretrained("YOUR_ACCOUNT/speecht5_finetuned_voxpopuli_nl") +``` + +Pick an example, here we'll take one from the test dataset. Obtain a speaker embedding. + +```py +>>> example = dataset["test"][304] +>>> speaker_embeddings = torch.tensor(example["speaker_embeddings"]).unsqueeze(0) +``` + +Define some input text and tokenize it. + +```py +>>> text = "hallo allemaal, ik praat nederlands. groetjes aan iedereen!" +``` + +Preprocess the input text: + +```py +>>> inputs = processor(text=text, return_tensors="pt") +``` + +Create a spectrogram with your model: + +```py +>>> spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings) +``` + +Visualize the spectrogram, if you'd like to: + +```py +>>> plt.figure() +>>> plt.imshow(spectrogram.T) +>>> plt.show() +``` + +
+ [figure: Generated log-mel spectrogram]
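The final step below calls a `vocoder` object to turn the spectrogram into a waveform. If you have not created it earlier in your session, it can be loaded from the HiFi-GAN vocoder checkpoint released alongside SpeechT5 (checkpoint name assumed to be the standard pairing):

```py
>>> from transformers import SpeechT5HifiGan

>>> vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
```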
+ +Finally, use the vocoder to turn the spectrogram into sound. + +```py +>>> with torch.no_grad(): +... speech = vocoder(spectrogram) + +>>> from IPython.display import Audio + +>>> Audio(speech.numpy(), rate=16000) +``` + +In our experience, obtaining satisfactory results from this model can be challenging. The quality of the speaker +embeddings appears to be a significant factor. Since SpeechT5 was pre-trained with English x-vectors, it performs best +when using English speaker embeddings. If the synthesized speech sounds poor, try using a different speaker embedding. + +Increasing the training duration is also likely to enhance the quality of the results. Even so, the speech clearly is Dutch instead of English, and it does +capture the voice characteristics of the speaker (compare to the original audio in the example). +Another thing to experiment with is the model's configuration. For example, try using `config.reduction_factor = 1` to +see if this improves the results. + +Finally, it is essential to consider ethical considerations. Although TTS technology has numerous useful applications, it +may also be used for malicious purposes, such as impersonating someone's voice without their knowledge or consent. Please +use TTS judiciously and responsibly. \ No newline at end of file From b369e507aaa78103baf5d3f3563952b44a0408a1 Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Thu, 4 May 2023 18:36:23 +0100 Subject: [PATCH 031/935] Generate: text generation pipeline no longer emits `max_length` warning when it is not set (#23139) --- src/transformers/generation/flax_utils.py | 2 +- src/transformers/generation/tf_utils.py | 2 +- src/transformers/generation/utils.py | 2 +- src/transformers/pipelines/text_generation.py | 32 +++++++++++++------ .../test_pipelines_text_generation.py | 32 ++++++++++++++++++- 5 files changed, 56 insertions(+), 14 deletions(-) diff --git a/src/transformers/generation/flax_utils.py b/src/transformers/generation/flax_utils.py index 58a2bf13ba61..65d65869afd2 100644 --- a/src/transformers/generation/flax_utils.py +++ b/src/transformers/generation/flax_utils.py @@ -385,7 +385,6 @@ def generate( UserWarning, ) elif generation_config.max_new_tokens is not None: - generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length if not has_default_max_length: logger.warning( f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(=" @@ -393,6 +392,7 @@ def generate( "Please refer to the documentation for more information. " "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)" ) + generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length if generation_config.min_length is not None and generation_config.min_length > generation_config.max_length: raise ValueError( diff --git a/src/transformers/generation/tf_utils.py b/src/transformers/generation/tf_utils.py index 5cd8153c2bf1..5e4bc58c8407 100644 --- a/src/transformers/generation/tf_utils.py +++ b/src/transformers/generation/tf_utils.py @@ -858,7 +858,6 @@ def generate( UserWarning, ) elif generation_config.max_new_tokens is not None: - generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length if not has_default_max_length: logger.warning( f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(=" @@ -866,6 +865,7 @@ def generate( "Please refer to the documentation for more information. 
" "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)" ) + generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length # If the input length is a tensor (i.e. dynamic length), skip length checks if not isinstance(input_ids_seq_length, tf.Tensor): diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 06836d4d4ae2..0f0191fb144f 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -1348,7 +1348,6 @@ def generate( UserWarning, ) elif generation_config.max_new_tokens is not None: - generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length if not has_default_max_length: logger.warning( f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(=" @@ -1356,6 +1355,7 @@ def generate( "Please refer to the documentation for more information. " "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)" ) + generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length if generation_config.min_length is not None and generation_config.min_length > generation_config.max_length: raise ValueError( diff --git a/src/transformers/pipelines/text_generation.py b/src/transformers/pipelines/text_generation.py index f95acf7d307f..60037339072d 100644 --- a/src/transformers/pipelines/text_generation.py +++ b/src/transformers/pipelines/text_generation.py @@ -1,3 +1,4 @@ +import copy import enum import warnings @@ -105,17 +106,8 @@ def _sanitize_parameters( prefix_inputs = self.tokenizer( prefix, padding=False, add_special_tokens=False, return_tensors=self.framework ) - prefix_length = prefix_inputs["input_ids"].shape[-1] + generate_kwargs["prefix_length"] = prefix_inputs["input_ids"].shape[-1] - if "max_new_tokens" in generate_kwargs: - pass - elif "max_length" in generate_kwargs: - generate_kwargs["max_length"] += prefix_length - else: - generate_kwargs["max_length"] = self.model.config.max_length + prefix_length - - if "min_length" in generate_kwargs: - generate_kwargs["min_length"] += prefix_length if handle_long_generation is not None: if handle_long_generation not in {"hole"}: raise ValueError( @@ -247,6 +239,26 @@ def _forward(self, model_inputs, **generate_kwargs): else: in_b = input_ids.shape[0] prompt_text = model_inputs.pop("prompt_text") + + # If there is a prefix, we may need to adjust the generation length. Do so without permanently modifying + # generate_kwargs, as some of the parameterization may come from the initialization of the pipeline. 
+ generate_kwargs = copy.deepcopy(generate_kwargs) + prefix_length = generate_kwargs.pop("prefix_length", 0) + if prefix_length > 0: + has_max_new_tokens = "max_new_tokens" in generate_kwargs or ( + "generation_config" in generate_kwargs + and generate_kwargs["generation_config"].max_new_tokens is not None + ) + if not has_max_new_tokens: + generate_kwargs["max_length"] = generate_kwargs.get("max_length") or self.model.config.max_length + generate_kwargs["max_length"] += prefix_length + has_min_new_tokens = "min_new_tokens" in generate_kwargs or ( + "generation_config" in generate_kwargs + and generate_kwargs["generation_config"].min_new_tokens is not None + ) + if not has_min_new_tokens and "min_length" in generate_kwargs: + generate_kwargs["min_length"] += prefix_length + # BS x SL generated_sequence = self.model.generate(input_ids=input_ids, attention_mask=attention_mask, **generate_kwargs) out_b = generated_sequence.shape[0] diff --git a/tests/pipelines/test_pipelines_text_generation.py b/tests/pipelines/test_pipelines_text_generation.py index 34dbef6df2d0..84b144905217 100644 --- a/tests/pipelines/test_pipelines_text_generation.py +++ b/tests/pipelines/test_pipelines_text_generation.py @@ -14,8 +14,15 @@ import unittest -from transformers import MODEL_FOR_CAUSAL_LM_MAPPING, TF_MODEL_FOR_CAUSAL_LM_MAPPING, TextGenerationPipeline, pipeline +from transformers import ( + MODEL_FOR_CAUSAL_LM_MAPPING, + TF_MODEL_FOR_CAUSAL_LM_MAPPING, + TextGenerationPipeline, + logging, + pipeline, +) from transformers.testing_utils import ( + CaptureLogger, is_pipeline_test, require_accelerate, require_tf, @@ -323,3 +330,26 @@ def test_pipeline_accelerate_top_p(self): pipe = pipeline(model="hf-internal-testing/tiny-random-bloom", device_map="auto", torch_dtype=torch.float16) pipe("This is a test", do_sample=True, top_p=0.5) + + def test_pipeline_length_setting_warning(self): + prompt = """Hello world""" + text_generator = pipeline("text-generation", model="hf-internal-testing/tiny-random-gpt2") + if text_generator.model.framework == "tf": + logger = logging.get_logger("transformers.generation.tf_utils") + else: + logger = logging.get_logger("transformers.generation.utils") + logger_msg = "Both `max_new_tokens`" # The beggining of the message to be checked in this test + + # Both are set by the user -> log warning + with CaptureLogger(logger) as cl: + _ = text_generator(prompt, max_length=10, max_new_tokens=1) + self.assertIn(logger_msg, cl.out) + + # The user only sets one -> no warning + with CaptureLogger(logger) as cl: + _ = text_generator(prompt, max_new_tokens=1) + self.assertNotIn(logger_msg, cl.out) + + with CaptureLogger(logger) as cl: + _ = text_generator(prompt, max_length=10) + self.assertNotIn(logger_msg, cl.out) From 01734dba842c29408c96caa5c345c9e415c7569b Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Thu, 4 May 2023 13:47:07 -0400 Subject: [PATCH 032/935] Revert "Add FlaxWhisperForAudioClassification model" (#23154) Revert "Add FlaxWhisperForAudioClassification model (#22883)" This reverts commit c8f2c5c56e942e8c45821d07555f2eab178b3f83. 
--- docs/source/en/model_doc/whisper.mdx | 6 - src/transformers/__init__.py | 8 +- .../models/auto/modeling_flax_auto.py | 5 - src/transformers/models/whisper/__init__.py | 2 - .../models/whisper/modeling_flax_whisper.py | 160 -------------- src/transformers/utils/dummy_flax_objects.py | 7 - .../whisper/test_modeling_flax_whisper.py | 205 +----------------- 7 files changed, 3 insertions(+), 390 deletions(-) diff --git a/docs/source/en/model_doc/whisper.mdx b/docs/source/en/model_doc/whisper.mdx index 52a8b5953c63..22b08e4e61bc 100644 --- a/docs/source/en/model_doc/whisper.mdx +++ b/docs/source/en/model_doc/whisper.mdx @@ -105,9 +105,3 @@ The original code can be found [here](https://github.com/openai/whisper). [[autodoc]] FlaxWhisperForConditionalGeneration - __call__ - -## FlaxWhisperForAudioClassification - -[[autodoc]] FlaxWhisperForAudioClassification - - __call__ - diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index b0766b0946cd..7bf322ca8e1e 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -3779,7 +3779,6 @@ "FlaxWhisperForConditionalGeneration", "FlaxWhisperModel", "FlaxWhisperPreTrainedModel", - "FlaxWhisperForAudioClassification", ] ) _import_structure["models.xglm"].extend( @@ -6904,12 +6903,7 @@ FlaxWav2Vec2Model, FlaxWav2Vec2PreTrainedModel, ) - from .models.whisper import ( - FlaxWhisperForAudioClassification, - FlaxWhisperForConditionalGeneration, - FlaxWhisperModel, - FlaxWhisperPreTrainedModel, - ) + from .models.whisper import FlaxWhisperForConditionalGeneration, FlaxWhisperModel, FlaxWhisperPreTrainedModel from .models.xglm import FlaxXGLMForCausalLM, FlaxXGLMModel, FlaxXGLMPreTrainedModel from .models.xlm_roberta import ( FLAX_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST, diff --git a/src/transformers/models/auto/modeling_flax_auto.py b/src/transformers/models/auto/modeling_flax_auto.py index e3b8d9cf5b52..755d1f07a344 100644 --- a/src/transformers/models/auto/modeling_flax_auto.py +++ b/src/transformers/models/auto/modeling_flax_auto.py @@ -229,11 +229,6 @@ ] ) -FLAX_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES = OrderedDict( - [ - ("whisper", "FlaxWhisperForAudioClassification"), - ] -) FLAX_MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, FLAX_MODEL_MAPPING_NAMES) FLAX_MODEL_FOR_PRETRAINING_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_PRETRAINING_MAPPING_NAMES) diff --git a/src/transformers/models/whisper/__init__.py b/src/transformers/models/whisper/__init__.py index cd962478e34d..3b6015a56f6f 100644 --- a/src/transformers/models/whisper/__init__.py +++ b/src/transformers/models/whisper/__init__.py @@ -75,7 +75,6 @@ "FlaxWhisperForConditionalGeneration", "FlaxWhisperModel", "FlaxWhisperPreTrainedModel", - "FlaxWhisperForAudioClassification", ] @@ -127,7 +126,6 @@ pass else: from .modeling_flax_whisper import ( - FlaxWhisperForAudioClassification, FlaxWhisperForConditionalGeneration, FlaxWhisperModel, FlaxWhisperPreTrainedModel, diff --git a/src/transformers/models/whisper/modeling_flax_whisper.py b/src/transformers/models/whisper/modeling_flax_whisper.py index e36131680d63..b8d6f07242d8 100644 --- a/src/transformers/models/whisper/modeling_flax_whisper.py +++ b/src/transformers/models/whisper/modeling_flax_whisper.py @@ -36,7 +36,6 @@ FlaxCausalLMOutputWithCrossAttentions, FlaxSeq2SeqLMOutput, FlaxSeq2SeqModelOutput, - FlaxSequenceClassifierOutput, ) from ...modeling_flax_utils import ( ACT2FN, @@ -1507,162 +1506,3 @@ def update_inputs_for_generation(self, model_outputs, 
model_kwargs): append_replace_return_docstrings( FlaxWhisperForConditionalGeneration, output_type=FlaxSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC ) - - -class FlaxWhisperForAudioClassificationModule(nn.Module): - config: WhisperConfig - dtype: jnp.dtype = jnp.float32 - - def setup(self) -> None: - self.encoder = FlaxWhisperEncoder(config=self.config, dtype=self.dtype) - self.config.is_encoder_decoder = False - num_layers = self.config.num_hidden_layers + 1 - if self.config.use_weighted_layer_sum: - self.layer_weights = jnp.repeat(1 / num_layers, num_layers) - self.projector = nn.Dense(self.config.classifier_proj_size, dtype=self.dtype) - self.classifier = nn.Dense(self.config.num_labels, dtype=self.dtype) - - def __call__( - self, - input_features, - encoder_outputs=None, - output_attentions=None, - output_hidden_states: bool = True, - return_dict: bool = True, - ): - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if encoder_outputs is None: - encoder_outputs = self.encoder( - input_features, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - if self.config.use_weighted_layer_sum: - hidden_states = jnp.stack(encoder_outputs, axis=1) - norm_weights = jax.nn.softmax(self.layer_weights, axis=-1) - hidden_states = jnp.sum(hidden_states * jnp.reshape(norm_weights, [-1, 1, 1]), axis=1) - else: - hidden_states = encoder_outputs[0] - - hidden_states = self.projector(hidden_states) - pooled_output = jnp.mean(hidden_states, axis=1) - - logits = self.classifier(pooled_output) - - if not return_dict: - return (logits,) + encoder_outputs[1:] - - return FlaxSequenceClassifierOutput( - logits=logits, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - ) - - -@add_start_docstrings("The Whisper Model with an audio classification head on top.", WHISPER_START_DOCSTRING) -class FlaxWhisperForAudioClassification(FlaxWhisperPreTrainedModel): - module_class = FlaxWhisperForAudioClassificationModule - dtype: jnp.dtype = jnp.float32 - - def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict: - # init input tensors - input_features = jnp.zeros(input_shape, dtype="f4") - input_features = input_features.at[(..., -1)].set(self.config.eos_token_id) - - params_rng, dropout_rng = jax.random.split(rng) - rngs = {"params": params_rng, "dropout": dropout_rng} - - random_params = self.module.init( - rngs, - input_features=input_features, - )["params"] - - if params is not None: - random_params = flatten_dict(unfreeze(random_params)) - params = flatten_dict(unfreeze(params)) - for missing_key in self._missing_keys: - params[missing_key] = random_params[missing_key] - self._missing_keys = set() - return freeze(unflatten_dict(params)) - else: - return random_params - - @add_start_docstrings_to_model_forward(WHISPER_INPUTS_DOCSTRING) - def __call__( - self, - input_features: jnp.ndarray, - attention_mask: Optional[jnp.ndarray] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - train: bool = False, - params: dict = None, - dropout_rng: PRNGKey = None, - **kwargs, - ): - output_attentions = output_attentions if 
output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.return_dict - - # Handle any PRNG if needed - rngs = {} - if dropout_rng is not None: - rngs["dropout"] = dropout_rng - - return self.module.apply( - {"params": params or self.params}, - input_features=jnp.array(input_features, dtype="f4"), - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - rngs=rngs, - ) - - -FLAX_WHISPER_AUDIO_CLASSIFICATION_DOCSTRING = r""" - Returns: - - Transcription example: - - ```python - >>> import jax.numpy as jnp - >>> from transformers import AutoFeatureExtractor, FlaxWhisperForAudioClassification - >>> from datasets import load_dataset - - >>> feature_extractor = AutoFeatureExtractor.from_pretrained("sanchit-gandhi/whisper-medium-fleurs-lang-id") - >>> model = FlaxWhisperForAudioClassification.from_pretrained( - ... "sanchit-gandhi/whisper-medium-fleurs-lang-id", from_pt=True - ... ) - >>> ds = load_dataset("google/fleurs", "all", split="validation", streaming=True) - - >>> sample = next(iter(ds)) - - >>> inputs = feature_extractor( - ... sample["audio"]["array"], sampling_rate=sample["audio"]["sampling_rate"], return_tensors="np" - ... ) - >>> input_features = inputs.input_features - - >>> logits = model(input_features).logits - - >>> predicted_class_ids = jnp.argmax(logits).item() - >>> predicted_label = model.config.id2label[predicted_class_ids] - >>> predicted_label - 'af_za' - ``` -""" - -overwrite_call_docstring( - FlaxWhisperForAudioClassification, WHISPER_INPUTS_DOCSTRING + FLAX_WHISPER_AUDIO_CLASSIFICATION_DOCSTRING -) -append_replace_return_docstrings( - FlaxWhisperForAudioClassification, output_type=FlaxSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC -) diff --git a/src/transformers/utils/dummy_flax_objects.py b/src/transformers/utils/dummy_flax_objects.py index ce571bc9f8d0..eeec3277492d 100644 --- a/src/transformers/utils/dummy_flax_objects.py +++ b/src/transformers/utils/dummy_flax_objects.py @@ -1182,13 +1182,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["flax"]) -class FlaxWhisperForAudioClassification(metaclass=DummyObject): - _backends = ["flax"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["flax"]) - - class FlaxWhisperForConditionalGeneration(metaclass=DummyObject): _backends = ["flax"] diff --git a/tests/models/whisper/test_modeling_flax_whisper.py b/tests/models/whisper/test_modeling_flax_whisper.py index 79a2c51039ac..3f1e201d72d8 100644 --- a/tests/models/whisper/test_modeling_flax_whisper.py +++ b/tests/models/whisper/test_modeling_flax_whisper.py @@ -12,6 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+ + import functools import inspect import tempfile @@ -39,7 +41,6 @@ from transformers import ( FLAX_MODEL_MAPPING, - FlaxWhisperForAudioClassification, FlaxWhisperForConditionalGeneration, FlaxWhisperModel, WhisperFeatureExtractor, @@ -703,205 +704,3 @@ def test_tiny_timestamp_generation(self): transcript = processor.batch_decode(generated_ids, skip_special_tokens=True, output_offsets=True) self.assertEqual(transcript, EXPECTED_TRANSCRIPT) - - -class FlaxWhisperEncoderModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=60, - is_training=True, - use_labels=True, - hidden_size=16, - num_hidden_layers=2, - num_attention_heads=4, - input_channels=1, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=20, - max_source_positions=30, - num_mel_bins=80, - num_conv_layers=1, - suppress_tokens=None, - begin_suppress_tokens=None, - classifier_proj_size=4, - num_labels=2, - is_encoder_decoder=False, - is_decoder=False, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_labels = use_labels - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.input_channels = input_channels - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.num_mel_bins = num_mel_bins - self.max_position_embeddings = max_position_embeddings - self.max_source_positions = max_source_positions - self.num_conv_layers = num_conv_layers - self.suppress_tokens = suppress_tokens - self.begin_suppress_tokens = begin_suppress_tokens - self.classifier_proj_size = classifier_proj_size - self.num_labels = num_labels - self.is_encoder_decoder = is_encoder_decoder - self.is_decoder = is_decoder - - def get_config(self): - return WhisperConfig( - d_model=self.hidden_size, - encoder_layers=self.num_hidden_layers, - decoder_layers=self.num_hidden_layers, - encoder_attention_heads=self.num_attention_heads, - decoder_attention_heads=self.num_attention_heads, - input_channels=self.input_channels, - dropout=self.hidden_dropout_prob, - attention_dropout=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - max_source_positions=self.max_source_positions, - decoder_ffn_dim=self.hidden_size, - encoder_ffn_dim=self.hidden_size, - suppress_tokens=self.suppress_tokens, - begin_suppress_tokens=self.begin_suppress_tokens, - classifier_proj_size=self.classifier_proj_size, - num_labels=self.num_labels, - is_encoder_decoder=self.is_encoder_decoder, - is_decoder=self.is_decoder, - ) - - def prepare_whisper_encoder_inputs_dict( - self, - input_features, - ): - return { - "input_features": input_features, - } - - def prepare_config_and_inputs(self): - input_features = floats_tensor([self.batch_size, self.num_mel_bins, self.seq_length]) - - config = self.get_config() - inputs_dict = self.prepare_whisper_encoder_inputs_dict( - input_features=input_features, - ) - return config, inputs_dict - - def prepare_config_and_inputs_for_common(self): - config, inputs_dict = self.prepare_config_and_inputs() - return config, inputs_dict - - def get_subsampled_output_lengths(self, input_lengths): - """ - Computes the output length of the convolutional layers - """ - - for i in range(self.num_conv_layers): - input_lengths = (input_lengths - 1) // 2 + 1 - - return input_lengths - - @property - def 
encoder_seq_length(self): - return self.get_subsampled_output_lengths(self.seq_length) - - -@require_flax -class WhisperEncoderModelTest(FlaxModelTesterMixin, unittest.TestCase): - all_model_classes = (FlaxWhisperForAudioClassification,) if is_flax_available() else () - is_encoder_decoder = False - fx_compatible = False - test_pruning = False - test_missing_keys = False - - input_name = "input_features" - - def setUp(self): - self.model_tester = FlaxWhisperEncoderModelTester(self) - _, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - self.init_shape = (1,) + inputs_dict["input_features"].shape[1:] - - self.all_model_classes = ( - make_partial_class(model_class, input_shape=self.init_shape) for model_class in self.all_model_classes - ) - self.config_tester = ConfigTester(self, config_class=WhisperConfig) - - def test_config(self): - self.config_tester.run_common_tests() - - # overwrite because of `input_features` - def test_jit_compilation(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - with self.subTest(model_class.__name__): - prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class) - model = model_class(config) - - @jax.jit - def model_jitted(input_features, **kwargs): - return model(input_features=input_features, **kwargs) - - with self.subTest("JIT Enabled"): - jitted_outputs = model_jitted(**prepared_inputs_dict).to_tuple() - - with self.subTest("JIT Disabled"): - with jax.disable_jit(): - outputs = model_jitted(**prepared_inputs_dict).to_tuple() - - self.assertEqual(len(outputs), len(jitted_outputs)) - for jitted_output, output in zip(jitted_outputs, outputs): - self.assertEqual(jitted_output.shape, output.shape) - - # overwrite because of `input_features` - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.__call__) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = ["input_features", "attention_mask", "output_attentions"] - self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) - - def test_inputs_embeds(self): - pass - - # WhisperEncoder has no inputs_embeds and thus the `get_input_embeddings` fn is not implemented - def test_model_common_attributes(self): - pass - - # WhisperEncoder cannot resize token embeddings since it has no tokens embeddings - def test_resize_tokens_embeddings(self): - pass - - # WhisperEncoder does not have any base model - def test_save_load_to_base(self): - pass - - # WhisperEncoder does not have any base model - def test_save_load_from_base(self): - pass - - # WhisperEncoder does not have any base model - @is_pt_flax_cross_test - def test_save_load_from_base_pt(self): - pass - - # WhisperEncoder does not have any base model - @is_pt_flax_cross_test - def test_save_load_to_base_pt(self): - pass - - # WhisperEncoder does not have any base model - @is_pt_flax_cross_test - def test_save_load_bf16_to_base_pt(self): - pass From 1b9c352e55f5316586ad0b3378fd33864e81cd09 Mon Sep 17 00:00:00 2001 From: Perry Huang Date: Fri, 5 May 2023 08:29:20 -0700 Subject: [PATCH 033/935] Add TrOCR resources (#23142) * Add TrOCR resources * Made fixes suggested by stevhliu --- docs/source/en/model_doc/trocr.mdx | 21 +++++++++++++++++++++ 1 file changed, 21 
insertions(+)

diff --git a/docs/source/en/model_doc/trocr.mdx b/docs/source/en/model_doc/trocr.mdx
index 3e3a6c100753..8ad65668627d 100644
--- a/docs/source/en/model_doc/trocr.mdx
+++ b/docs/source/en/model_doc/trocr.mdx
@@ -50,6 +50,27 @@ Tips:
   information, see the [official models](https://huggingface.co/models?other=trocr>).
 - TrOCR is always used within the [VisionEncoderDecoder](vision-encoder-decoder) framework.
 
+## Resources
+
+A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with TrOCR. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
+
+
+
+- A blog post on [Accelerating Document AI](https://huggingface.co/blog/document-ai) with TrOCR.
+- A blog post on how to do [Document AI](https://github.com/philschmid/document-ai-transformers) with TrOCR.
+- A notebook on how to [fine-tune TrOCR on the IAM Handwriting Database using Seq2SeqTrainer](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TrOCR/Fine_tune_TrOCR_on_IAM_Handwriting_Database_using_Seq2SeqTrainer.ipynb).
+- A notebook on [inference with TrOCR](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TrOCR/Inference_with_TrOCR_%2B_Gradio_demo.ipynb) and a Gradio demo.
+- A notebook on [fine-tuning TrOCR on the IAM Handwriting Database](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TrOCR/Fine_tune_TrOCR_on_IAM_Handwriting_Database_using_native_PyTorch.ipynb) using native PyTorch.
+- A notebook on [evaluating TrOCR on the IAM test set](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TrOCR/Evaluating_TrOCR_base_handwritten_on_the_IAM_test_set.ipynb).
+
+
+
+- [Causal language modeling](https://huggingface.co/docs/transformers/tasks/language_modeling) task guide.
+
+⚡️ Inference
+
+- An interactive demo on [TrOCR handwritten character recognition](https://huggingface.co/spaces/nielsr/TrOCR-handwritten).
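For orientation, a minimal sketch of the `VisionEncoderDecoder`-style inference flow these resources cover; the `microsoft/trocr-base-handwritten` checkpoint and the `line.png` image path are illustrative assumptions, not prescribed by this patch:

```py
from PIL import Image

from transformers import TrOCRProcessor, VisionEncoderDecoderModel

# Illustrative checkpoint; any TrOCR checkpoint on the Hub follows the same flow.
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")

# Placeholder path to an image containing a single line of handwritten text.
image = Image.open("line.png").convert("RGB")

pixel_values = processor(images=image, return_tensors="pt").pixel_values
generated_ids = model.generate(pixel_values)
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])
```

`TrOCRProcessor` wraps both the image processor and the tokenizer, so it handles resizing and normalization on the way in and decoding of the generated ids on the way out.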
+ ## Inference TrOCR's [`VisionEncoderDecoder`] model accepts images as input and makes use of From 77412343c8825dd2ed40729367e7708c3d2ec7ac Mon Sep 17 00:00:00 2001 From: Andrei Filatov <43551010+anvilarth@users.noreply.github.com> Date: Fri, 5 May 2023 18:36:15 +0300 Subject: [PATCH 034/935] fixed whisper positional encoding (#23167) --- src/transformers/models/whisper/modeling_tf_whisper.py | 2 +- src/transformers/models/whisper/modeling_whisper.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/whisper/modeling_tf_whisper.py b/src/transformers/models/whisper/modeling_tf_whisper.py index 12f6e7db5eef..d9a175062b36 100644 --- a/src/transformers/models/whisper/modeling_tf_whisper.py +++ b/src/transformers/models/whisper/modeling_tf_whisper.py @@ -128,7 +128,7 @@ def build(self, input_shape): def call(self, input_ids, past_key_values_length=0): past_key_values_length = tf.cast(past_key_values_length, tf.int32) - gather_indices = tf.range(tf.shape(input_ids)[-1], delta=1) + past_key_values_length + gather_indices = tf.range(tf.shape(input_ids)[1], delta=1) + past_key_values_length return tf.gather(self.weight, gather_indices) diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py index ed845febac82..bde800911600 100644 --- a/src/transformers/models/whisper/modeling_whisper.py +++ b/src/transformers/models/whisper/modeling_whisper.py @@ -226,7 +226,7 @@ def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional super().__init__(num_positions, embedding_dim) def forward(self, input_ids, past_key_values_length=0): - return self.weight[past_key_values_length : past_key_values_length + input_ids.shape[-1]] + return self.weight[past_key_values_length : past_key_values_length + input_ids.shape[1]] class WhisperAttention(nn.Module): From 40082d598b756c4f1fb048c571ed95c31f05a69c Mon Sep 17 00:00:00 2001 From: Gabriel Yang Date: Sat, 6 May 2023 00:36:56 +0900 Subject: [PATCH 035/935] =?UTF-8?q?=F0=9F=8C=90=20[i18n-KO]=20docs:=20ko:?= =?UTF-8?q?=20Translate=20`multiple=5Fchoice.mdx`=20(#23064)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * update doctree * doc: ko: translate multiple choice * Update reviews --- docs/source/ko/_toctree.yml | 4 +- docs/source/ko/tasks/multiple_choice.mdx | 461 +++++++++++++++++++++++ 2 files changed, 463 insertions(+), 2 deletions(-) create mode 100644 docs/source/ko/tasks/multiple_choice.mdx diff --git a/docs/source/ko/_toctree.yml b/docs/source/ko/_toctree.yml index bd24ee4e5ce1..35744420e93e 100644 --- a/docs/source/ko/_toctree.yml +++ b/docs/source/ko/_toctree.yml @@ -38,8 +38,8 @@ title: 번역 - local: tasks/summarization title: 요약 - - local: in_translation - title: (번역중) Multiple Choice + - local: tasks/multiple_choice + title: 객관식 문제(Multiple Choice) title: 자연어처리 isExpanded: false - sections: diff --git a/docs/source/ko/tasks/multiple_choice.mdx b/docs/source/ko/tasks/multiple_choice.mdx new file mode 100644 index 000000000000..9a259ee77ae6 --- /dev/null +++ b/docs/source/ko/tasks/multiple_choice.mdx @@ -0,0 +1,461 @@ + + +# 객관식 문제[[multiple-choice]] + +[[open-in-colab]] + +객관식 과제는 문맥과 함께 여러 개의 후보 답변이 제공되고 모델이 정답을 선택하도록 학습된다는 점을 제외하면 질의응답과 유사합니다. + +진행하는 방법은 아래와 같습니다: + +1. [SWAG](https://huggingface.co/datasets/swag) 데이터 세트의 'regular' 구성으로 [BERT](https://huggingface.co/bert-base-uncased)를 미세 조정하여 여러 옵션과 일부 컨텍스트가 주어졌을 때 가장 적합한 답을 선택합니다. +2. 추론에 미세 조정된 모델을 사용합니다. 
+ + +이 튜토리얼에서 설명하는 작업은 다음 모델 아키텍처에서 지원됩니다: + + + +[ALBERT](../model_doc/albert), [BERT](../model_doc/bert), [BigBird](../model_doc/big_bird), [CamemBERT](../model_doc/camembert), [CANINE](../model_doc/canine), [ConvBERT](../model_doc/convbert), [Data2VecText](../model_doc/data2vec-text), [DeBERTa-v2](../model_doc/deberta-v2), [DistilBERT](../model_doc/distilbert), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [ErnieM](../model_doc/ernie_m), [FlauBERT](../model_doc/flaubert), [FNet](../model_doc/fnet), [Funnel Transformer](../model_doc/funnel), [I-BERT](../model_doc/ibert), [Longformer](../model_doc/longformer), [LUKE](../model_doc/luke), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MobileBERT](../model_doc/mobilebert), [MPNet](../model_doc/mpnet), [Nezha](../model_doc/nezha), [Nyströmformer](../model_doc/nystromformer), [QDQBert](../model_doc/qdqbert), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [SqueezeBERT](../model_doc/squeezebert), [XLM](../model_doc/xlm), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod), [YOSO](../model_doc/yoso) + + + + + +시작하기 전에 필요한 라이브러리가 모두 설치되어 있는지 확인하세요: + +```bash +pip install transformers datasets evaluate +``` + +모델을 업로드하고 커뮤니티와 공유할 수 있도록 허깅페이스 계정에 로그인하는 것이 좋습니다. 메시지가 표시되면 토큰을 입력하여 로그인합니다: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## SWAG 데이터 세트 가져오기[[load-swag-dataset]] + +먼저 🤗 Datasets 라이브러리에서 SWAG 데이터셋의 '일반' 구성을 가져옵니다: + +```py +>>> from datasets import load_dataset + +>>> swag = load_dataset("swag", "regular") +``` + +이제 데이터를 살펴봅니다: + +```py +>>> swag["train"][0] +{'ending0': 'passes by walking down the street playing their instruments.', + 'ending1': 'has heard approaching them.', + 'ending2': "arrives and they're outside dancing and asleep.", + 'ending3': 'turns the lead singer watches the performance.', + 'fold-ind': '3416', + 'gold-source': 'gold', + 'label': 0, + 'sent1': 'Members of the procession walk down the street holding small horn brass instruments.', + 'sent2': 'A drum line', + 'startphrase': 'Members of the procession walk down the street holding small horn brass instruments. A drum line', + 'video-id': 'anetv_jkn6uvmqwh4'} +``` + +여기에는 많은 필드가 있는 것처럼 보이지만 실제로는 매우 간단합니다: + +- `sent1` 및 `sent2`: 이 필드는 문장이 어떻게 시작되는지 보여주며, 이 두 필드를 합치면 `시작 구절(startphrase)` 필드가 됩니다. +- `종료 구절(ending)`: 문장이 어떻게 끝날 수 있는지에 대한 가능한 종료 구절를 제시하지만 그 중 하나만 정답입니다. +- `레이블(label)`: 올바른 문장 종료 구절을 식별합니다. + +## 전처리[[preprocess]] + +다음 단계는 문장의 시작과 네 가지 가능한 구절을 처리하기 위해 BERT 토크나이저를 불러옵니다: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") +``` + +생성하려는 전처리 함수는 다음과 같아야 합니다: + +1. `sent1` 필드를 네 개 복사한 다음 각각을 `sent2`와 결합하여 문장이 시작되는 방식을 재현합니다. +2. `sent2`를 네 가지 가능한 문장 구절 각각과 결합합니다. +3. 이 두 목록을 토큰화할 수 있도록 평탄화(flatten)하고, 각 예제에 해당하는 `input_ids`, `attention_mask` 및 `labels` 필드를 갖도록 다차원화(unflatten) 합니다. + +```py +>>> ending_names = ["ending0", "ending1", "ending2", "ending3"] + + +>>> def preprocess_function(examples): +... first_sentences = [[context] * 4 for context in examples["sent1"]] +... question_headers = examples["sent2"] +... second_sentences = [ +... [f"{header} {examples[end][i]}" for end in ending_names] for i, header in enumerate(question_headers) +... ] + +... 
first_sentences = sum(first_sentences, []) +... second_sentences = sum(second_sentences, []) + +... tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True) +... return {k: [v[i : i + 4] for i in range(0, len(v), 4)] for k, v in tokenized_examples.items()} +``` + +전체 데이터 집합에 전처리 기능을 적용하려면 🤗 Datasets [`~datasets.Dataset.map`] 메소드를 사용합니다. `batched=True`를 설정하여 데이터 집합의 여러 요소를 한 번에 처리하면 `map` 함수의 속도를 높일 수 있습니다: + +```py +tokenized_swag = swag.map(preprocess_function, batched=True) +``` + +🤗 Transformers에는 객관식용 데이터 콜레이터가 없으므로 예제 배치를 만들려면 [`DataCollatorWithPadding`]을 조정해야 합니다. 데이터 정렬 중에 전체 데이터 집합을 최대 길이로 패딩하는 대신 배치 중 가장 긴 길이로 문장을 *동적 패딩*하는 것이 더 효율적입니다. + +`DataCollatorForMultipleChoice`는 모든 모델 입력을 평탄화하고 패딩을 적용하며 그 결과를 결과를 다차원화합니다: + + + +```py +>>> from dataclasses import dataclass +>>> from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy +>>> from typing import Optional, Union +>>> import torch + + +>>> @dataclass +... class DataCollatorForMultipleChoice: +... """ +... Data collator that will dynamically pad the inputs for multiple choice received. +... """ + +... tokenizer: PreTrainedTokenizerBase +... padding: Union[bool, str, PaddingStrategy] = True +... max_length: Optional[int] = None +... pad_to_multiple_of: Optional[int] = None + +... def __call__(self, features): +... label_name = "label" if "label" in features[0].keys() else "labels" +... labels = [feature.pop(label_name) for feature in features] +... batch_size = len(features) +... num_choices = len(features[0]["input_ids"]) +... flattened_features = [ +... [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features +... ] +... flattened_features = sum(flattened_features, []) + +... batch = self.tokenizer.pad( +... flattened_features, +... padding=self.padding, +... max_length=self.max_length, +... pad_to_multiple_of=self.pad_to_multiple_of, +... return_tensors="pt", +... ) + +... batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()} +... batch["labels"] = torch.tensor(labels, dtype=torch.int64) +... return batch +``` + + +```py +>>> from dataclasses import dataclass +>>> from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy +>>> from typing import Optional, Union +>>> import tensorflow as tf + + +>>> @dataclass +... class DataCollatorForMultipleChoice: +... """ +... Data collator that will dynamically pad the inputs for multiple choice received. +... """ + +... tokenizer: PreTrainedTokenizerBase +... padding: Union[bool, str, PaddingStrategy] = True +... max_length: Optional[int] = None +... pad_to_multiple_of: Optional[int] = None + +... def __call__(self, features): +... label_name = "label" if "label" in features[0].keys() else "labels" +... labels = [feature.pop(label_name) for feature in features] +... batch_size = len(features) +... num_choices = len(features[0]["input_ids"]) +... flattened_features = [ +... [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features +... ] +... flattened_features = sum(flattened_features, []) + +... batch = self.tokenizer.pad( +... flattened_features, +... padding=self.padding, +... max_length=self.max_length, +... pad_to_multiple_of=self.pad_to_multiple_of, +... return_tensors="tf", +... ) + +... batch = {k: tf.reshape(v, (batch_size, num_choices, -1)) for k, v in batch.items()} +... batch["labels"] = tf.convert_to_tensor(labels, dtype=tf.int64) +... 
return batch +``` + + + +## 평가 하기[[evaluate]] + +훈련 중에 메트릭을 포함하면 모델의 성능을 평가하는 데 도움이 되는 경우가 많습니다. 🤗[Evaluate](https://huggingface.co/docs/evaluate/index) 라이브러리를 사용하여 평가 방법을 빠르게 가져올 수 있습니다. 이 작업에서는 [accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy) 지표를 가져옵니다(🤗 Evaluate [둘러보기](https://huggingface.co/docs/evaluate/a_quick_tour)를 참조하여 지표를 가져오고 계산하는 방법에 대해 자세히 알아보세요): + +```py +>>> import evaluate + +>>> accuracy = evaluate.load("accuracy") +``` + +그리고 예측과 레이블을 [`~evaluate.EvaluationModule.compute`]에 전달하여 정확도를 계산하는 함수를 만듭니다: + +```py +>>> import numpy as np + + +>>> def compute_metrics(eval_pred): +... predictions, labels = eval_pred +... predictions = np.argmax(predictions, axis=1) +... return accuracy.compute(predictions=predictions, references=labels) +``` + +이제 `compute_metrics` 함수를 사용할 준비가 되었으며, 훈련을 설정할 때 이 함수로 돌아가게 됩니다. + +## 훈련 하기[[train]] + + + + + +[`Trainer`]로 모델을 미세 조정하는 데 익숙하지 않다면 기본 튜토리얼 [여기](../training#train-with-pytorch-trainer)를 살펴보세요! + + + +이제 모델 훈련을 시작할 준비가 되었습니다! [`AutoModelForMultipleChoice`]로 BERT를 로드합니다: + +```py +>>> from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer + +>>> model = AutoModelForMultipleChoice.from_pretrained("bert-base-uncased") +``` + +이제 세 단계만 남았습니다: + +1. 훈련 하이퍼파라미터를 [`TrainingArguments`]에 정의합니다. 유일한 필수 매개변수는 모델을 저장할 위치를 지정하는 `output_dir`입니다. `push_to_hub=True`를 설정하여 이 모델을 허브에 푸시합니다(모델을 업로드하려면 허깅 페이스에 로그인해야 합니다). 각 에폭이 끝날 때마다 [`Trainer`]가 정확도를 평가하고 훈련 체크포인트를 저장합니다. +2. 모델, 데이터 세트, 토크나이저, 데이터 콜레이터, `compute_metrics` 함수와 함께 훈련 인자를 [`Trainer`]에 전달합니다. +3. [`~Trainer.train`]을 사용하여 모델을 미세 조정합니다. + +```py +>>> training_args = TrainingArguments( +... output_dir="my_awesome_swag_model", +... evaluation_strategy="epoch", +... save_strategy="epoch", +... load_best_model_at_end=True, +... learning_rate=5e-5, +... per_device_train_batch_size=16, +... per_device_eval_batch_size=16, +... num_train_epochs=3, +... weight_decay=0.01, +... push_to_hub=True, +... ) + +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=tokenized_swag["train"], +... eval_dataset=tokenized_swag["validation"], +... tokenizer=tokenizer, +... data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer), +... compute_metrics=compute_metrics, +... ) + +>>> trainer.train() +``` + +훈련이 완료되면 모든 사람이 모델을 사용할 수 있도록 [`~transformers.Trainer.push_to_hub`] 메소드를 사용하여 모델을 허브에 공유하세요: + +```py +>>> trainer.push_to_hub() +``` + + + + +Keras로 모델을 미세 조정하는 데 익숙하지 않다면 기본 튜토리얼 [여기](../training#train-a-tensorflow-model-with-keras)를 살펴보시기 바랍니다! + + +TensorFlow에서 모델을 미세 조정하려면 최적화 함수, 학습률 스케쥴 및 몇 가지 학습 하이퍼파라미터를 설정하는 것부터 시작하세요: + +```py +>>> from transformers import create_optimizer + +>>> batch_size = 16 +>>> num_train_epochs = 2 +>>> total_train_steps = (len(tokenized_swag["train"]) // batch_size) * num_train_epochs +>>> optimizer, schedule = create_optimizer(init_lr=5e-5, num_warmup_steps=0, num_train_steps=total_train_steps) +``` + +그리고 [`TFAutoModelForMultipleChoice`]로 BERT를 가져올 수 있습니다: + +```py +>>> from transformers import TFAutoModelForMultipleChoice + +>>> model = TFAutoModelForMultipleChoice.from_pretrained("bert-base-uncased") +``` + +[`~transformers.TFPreTrainedModel.prepare_tf_dataset`]을 사용하여 데이터 세트를 `tf.data.Dataset` 형식으로 변환합니다: + +```py +>>> data_collator = DataCollatorForMultipleChoice(tokenizer=tokenizer) +>>> tf_train_set = model.prepare_tf_dataset( +... tokenized_swag["train"], +... shuffle=True, +... batch_size=batch_size, +... collate_fn=data_collator, +... ) + +>>> tf_validation_set = model.prepare_tf_dataset( +... 
tokenized_swag["validation"], +... shuffle=False, +... batch_size=batch_size, +... collate_fn=data_collator, +... ) +``` + +[`compile`](https://keras.io/api/models/model_training_apis/#compile-method)을 사용하여 훈련 모델을 구성합니다: + +```py +>>> model.compile(optimizer=optimizer) +``` + +훈련을 시작하기 전에 설정해야 할 마지막 두 가지는 예측의 정확도를 계산하고 모델을 허브로 푸시하는 방법을 제공하는 것입니다. 이 두 가지 작업은 모두 [Keras 콜백](../main_classes/keras_callbacks)을 사용하여 수행할 수 있습니다. + +`compute_metrics`함수를 [`~transformers.KerasMetricCallback`]에 전달하세요: + +```py +>>> from transformers.keras_callbacks import KerasMetricCallback + +>>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set) +``` + +모델과 토크나이저를 업로드할 위치를 [`~transformers.PushToHubCallback`]에서 지정하세요: + +```py +>>> from transformers.keras_callbacks import PushToHubCallback + +>>> push_to_hub_callback = PushToHubCallback( +... output_dir="my_awesome_model", +... tokenizer=tokenizer, +... ) +``` + +그리고 콜백을 함께 묶습니다: + +```py +>>> callbacks = [metric_callback, push_to_hub_callback] +``` + +이제 모델 훈련을 시작합니다! 훈련 및 검증 데이터 세트, 에폭 수, 콜백을 사용하여 [`fit`](https://keras.io/api/models/model_training_apis/#fit-method)을 호출하고 모델을 미세 조정합니다: + +```py +>>> model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=2, callbacks=callbacks) +``` + +훈련이 완료되면 모델이 자동으로 허브에 업로드되어 누구나 사용할 수 있습니다! + + + + + + +객관식 모델을 미세 조정하는 방법에 대한 보다 심층적인 예는 아래 문서를 참조하세요. +[PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice.ipynb) +또는 [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice-tf.ipynb). + + + +## 추론 하기[[inference]] + +이제 모델을 미세 조정했으니 추론에 사용할 수 있습니다! + +텍스트와 두 개의 후보 답안을 작성합니다: + +```py +>>> prompt = "France has a bread law, Le Décret Pain, with strict rules on what is allowed in a traditional baguette." +>>> candidate1 = "The law does not apply to croissants and brioche." +>>> candidate2 = "The law applies to baguettes." +``` + + + +각 프롬프트와 후보 답변 쌍을 토큰화하여 PyTorch 텐서를 반환합니다. 
또한 `labels`을 생성해야 합니다: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_swag_model") +>>> inputs = tokenizer([[prompt, candidate1], [prompt, candidate2]], return_tensors="pt", padding=True) +>>> labels = torch.tensor(0).unsqueeze(0) +``` + +입력과 레이블을 모델에 전달하고 `logits`을 반환합니다: + +```py +>>> from transformers import AutoModelForMultipleChoice + +>>> model = AutoModelForMultipleChoice.from_pretrained("my_awesome_swag_model") +>>> outputs = model(**{k: v.unsqueeze(0) for k, v in inputs.items()}, labels=labels) +>>> logits = outputs.logits +``` + +가장 높은 확률을 가진 클래스를 가져옵니다: + +```py +>>> predicted_class = logits.argmax().item() +>>> predicted_class +'0' +``` + + +각 프롬프트와 후보 답안 쌍을 토큰화하여 텐서플로 텐서를 반환합니다: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_swag_model") +>>> inputs = tokenizer([[prompt, candidate1], [prompt, candidate2]], return_tensors="tf", padding=True) +``` + +모델에 입력을 전달하고 `logits`를 반환합니다: + +```py +>>> from transformers import TFAutoModelForMultipleChoice + +>>> model = TFAutoModelForMultipleChoice.from_pretrained("my_awesome_swag_model") +>>> inputs = {k: tf.expand_dims(v, 0) for k, v in inputs.items()} +>>> outputs = model(inputs) +>>> logits = outputs.logits +``` + +가장 높은 확률을 가진 클래스를 가져옵니다: + +```py +>>> predicted_class = int(tf.math.argmax(logits, axis=-1)[0]) +>>> predicted_class +'0' +``` + + From 17083b9b847c71e8c303e9cb0798a8928e99a6e0 Mon Sep 17 00:00:00 2001 From: Connor Henderson Date: Fri, 5 May 2023 11:52:19 -0400 Subject: [PATCH 036/935] fix: Passing language as acronym to Whisper generate (#23141) * add fix * address comments * remove error formatting --- .../models/whisper/modeling_whisper.py | 8 ++++++-- tests/models/whisper/test_modeling_whisper.py | 15 +++++++++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py index bde800911600..91de6810b17e 100644 --- a/src/transformers/models/whisper/modeling_whisper.py +++ b/src/transformers/models/whisper/modeling_whisper.py @@ -1562,6 +1562,7 @@ def generate( generation_config.return_timestamps = False if language is not None: + language = language.lower() generation_config.language = language if task is not None: generation_config.task = task @@ -1573,10 +1574,13 @@ def generate( language_token = generation_config.language elif generation_config.language in TO_LANGUAGE_CODE.keys(): language_token = f"<|{TO_LANGUAGE_CODE[generation_config.language]}|>" + elif generation_config.language in TO_LANGUAGE_CODE.values(): + language_token = f"<|{generation_config.language}|>" else: + is_language_code = len(generation_config.language) == 2 raise ValueError( - f"Unsupported language: {self.language}. Language should be one of:" - f" {list(TO_LANGUAGE_CODE.keys()) if generation_config.language in TO_LANGUAGE_CODE.keys() else list(TO_LANGUAGE_CODE.values())}." + f"Unsupported language: {generation_config.language}. Language should be one of:" + f" {list(TO_LANGUAGE_CODE.values()) if is_language_code else list(TO_LANGUAGE_CODE.keys())}." 
) forced_decoder_ids.append((1, generation_config.lang_to_id[language_token])) else: diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py index dd6ad07eb494..0591c6f46436 100644 --- a/tests/models/whisper/test_modeling_whisper.py +++ b/tests/models/whisper/test_modeling_whisper.py @@ -414,6 +414,21 @@ def test_generate_fp16(self): model.generate(input_features) model.generate(input_features, num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3) + def test_generate_language(self): + config, input_dict = self.model_tester.prepare_config_and_inputs() + input_features = input_dict["input_features"] + model = WhisperForConditionalGeneration(config).to(torch_device) + # Hack to keep the test fast and not require downloading a model with a generation_config + model.generation_config.__setattr__("lang_to_id", {"<|en|>": 1}) + model.generation_config.__setattr__("task_to_id", {"transcribe": 2}) + + # test language code + model.generate(input_features, language="en") + # test tokenizer code + model.generate(input_features, language="<|en|>") + # test language name + model.generate(input_features, language="English") + def test_forward_signature(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() From fc6c8b0eaa8755bff60a48757d2e37e3a03a33dd Mon Sep 17 00:00:00 2001 From: Ashwin Mathur <97467100+awinml@users.noreply.github.com> Date: Fri, 5 May 2023 22:52:49 +0530 Subject: [PATCH 037/935] Add `no_trainer` scripts to pre-train Vision Transformers (#23156) * Add run_mim_no_trainer.py draft from #20412 Add parse_args method and copy over other dependencies Add Method call for sending telemetry Initialize Accelerator Make one log on every process Set seed and Handle repository creation Initialize dataset and Set validation split Create Config Adapt Config Update Config Create Feature Extractor Create model Set column names Create transforms Create mask generator Create method to preprocess images Shuffle datasets if needed and set transforms Create Dataloaders Add optimizer Add learning rate scheduler Prepare everything with our accelerator Tie weights for TPU training Recalculate training steps and training epochs Set accelerator checkpointing steps Initialize trackers and store configuration Set total batch size Fix typo: mlm -> mim Log info at the start of training Load in the weights and states from previous save update the progress_bar if load from checkpoint Define train loop Add evaluation loop to training Add to parse_args method Push repo to hub Save accelerator state End training and save model and feature extractor Remove unused imports Fix trailing whitespace * Update code based on comments, Rename feature_extractor to image_processor * Fix linting * Add argument for learning rate * Add argument for setting number of training epochs * Remove incorrect logger argument * Convert max_train_steps to int for tqdm --------- Co-authored-by: Saad Mahmud --- .../image-pretraining/run_mim_no_trainer.py | 771 ++++++++++++++++++ 1 file changed, 771 insertions(+) create mode 100644 examples/pytorch/image-pretraining/run_mim_no_trainer.py diff --git a/examples/pytorch/image-pretraining/run_mim_no_trainer.py b/examples/pytorch/image-pretraining/run_mim_no_trainer.py new file mode 100644 index 000000000000..a94585d39698 --- /dev/null +++ b/examples/pytorch/image-pretraining/run_mim_no_trainer.py @@ -0,0 +1,771 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. 
All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and + +import argparse +import logging +import math +import os +from pathlib import Path + +import datasets +import numpy as np +import torch +from accelerate import Accelerator, DistributedType +from accelerate.utils import set_seed +from datasets import load_dataset +from huggingface_hub import Repository +from torch.utils.data import DataLoader +from torchvision.transforms import Compose, Lambda, Normalize, RandomHorizontalFlip, RandomResizedCrop, ToTensor +from tqdm.auto import tqdm + +import transformers +from transformers import ( + CONFIG_MAPPING, + IMAGE_PROCESSOR_MAPPING, + MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING, + AutoConfig, + AutoImageProcessor, + AutoModelForMaskedImageModeling, + SchedulerType, + get_scheduler, +) +from transformers.utils import check_min_version, get_full_repo_name, send_example_telemetry +from transformers.utils.versions import require_version + + +""" Pre-training a 🤗 Transformers model for simple masked image modeling (SimMIM) +without using HuggingFace Trainer. +Any model supported by the AutoModelForMaskedImageModeling API can be used. +""" + +logger = logging.getLogger(__name__) + +# Will error if the minimal version of Transformers is not installed. Remove at your own risks. +check_min_version("4.25.0.dev0") + +require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") + +MODEL_CONFIG_CLASSES = list(MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING.keys()) +MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Finetune a transformers model on a simple Masked Image Modeling task" + ) + parser.add_argument( + "--dataset_name", + type=str, + default="cifar10", + help="Name of a dataset from the datasets package", + ) + parser.add_argument( + "--dataset_config_name", + type=str, + default=None, + help="The configuration name of the dataset to use (via the datasets library).", + ) + parser.add_argument( + "--image_column_name", + type=str, + default=None, + help="The column name of the images in the files. If not set, will try to use 'image' or 'img'.", + ) + parser.add_argument( + "--train_dir", + type=str, + default=None, + help="A folder containing the training data.", + ) + parser.add_argument( + "--validation_dir", + type=None, + default=None, + help="A folder containing the validation data.", + ) + parser.add_argument( + "--train_val_split", + type=float, + default=0.15, + help="Percent to split off of train for validation.", + ) + parser.add_argument( + "--mask_patch_size", + type=int, + default=32, + help="The size of the square patches to use for masking.", + ) + parser.add_argument( + "--mask_ratio", + type=float, + default=0.6, + help="Percentage of patches to mask.", + ) + parser.add_argument( + "--max_train_samples", + type=int, + default=None, + help=( + "For debugging purposes or quicker training, truncate the number of training examples to this " + "value if set." 
+ ), + ) + parser.add_argument( + "--max_eval_samples", + type=int, + default=None, + help=( + "For debugging purposes or quicker training, truncate the number of evaluation examples to this " + "value if set." + ), + ) + parser.add_argument( + "--model_name_or_path", + type=str, + default=None, + help=( + "The model checkpoint for weights initialization. Can be a local path to a pytorch_model.bin or a " + "checkpoint identifier on the hub. " + "Don't set if you want to train a model from scratch." + ), + ) + parser.add_argument( + "--model_type", + type=str, + default=None, + help="If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES), + ) + parser.add_argument( + "--config_name_or_path", + type=str, + default=None, + help="Pretrained config name or path if not the same as model_name", + ) + parser.add_argument( + "--config_overrides", + type=str, + default=None, + help=( + "Override some existing default config settings when a model is trained from scratch. Example: " + "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index" + ), + ) + parser.add_argument( + "--cache_dir", + type=str, + default=None, + help="Where do you want to store (cache) the pretrained models/datasets downloaded from the hub", + ) + parser.add_argument( + "--model_revision", + type=str, + default="main", + help="The specific model version to use (can be a branch name, tag name or commit id).", + ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument( + "--image_processor_name", + type=str, + default=None, + help="Name or path of preprocessor config.", + ) + parser.add_argument( + "--use_auth_token", + type=bool, + default=False, + help=( + "Will use the token generated when running `huggingface-cli login` (necessary to use this script " + "with private models)." + ), + ) + parser.add_argument( + "--image_size", + type=int, + default=None, + help="The size (resolution) of each image. If not specified, will use `image_size` of the configuration.", + ) + parser.add_argument( + "--patch_size", + type=int, + default=None, + help="The size (resolution) of each patch. If not specified, will use `patch_size` of the configuration.", + ) + parser.add_argument( + "--encoder_stride", + type=int, + default=None, + help={"help": "Stride to use for the encoder."}, + ) + parser.add_argument( + "--push_to_hub", + action="store_true", + help="Whether or not to push the model to the Hub.", + ) + parser.add_argument( + "--with_tracking", + action="store_true", + help="Whether to enable experiment trackers for logging.", + ) + parser.add_argument( + "--report_to", + type=str, + default="all", + help=( + 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,' + ' `"wandb"`, `"comet_ml"` and `"clearml"`. Use `"all"` (default) to report to all integrations.' + "Only applicable when `--with_tracking` is passed." 
+ ), + ) + parser.add_argument( + "--seed", + type=int, + default=None, + help="A seed for reproducible training.", + ) + parser.add_argument( + "--per_device_train_batch_size", + type=int, + default=8, + help="Batch size (per device) for the training dataloader.", + ) + parser.add_argument( + "--learning_rate", + type=float, + default=5e-5, + help="The initial learning rate for [`AdamW`] optimizer.", + ) + parser.add_argument( + "--weight_decay", + type=float, + default=0.0, + help="Weight decay to use.", + ) + parser.add_argument( + "--num_train_epochs", + type=float, + default=3.0, + help="Total number of training epochs to perform (if not an integer, will perform the decimal part percents of the last epoch before stopping training).", + ) + parser.add_argument( + "--max_train_steps", + type=int, + default=None, + help="Total number of training steps to perform. If provided, overrides num_train_epochs.", + ) + parser.add_argument( + "--lr_scheduler_type", + type=SchedulerType, + default="linear", + help="The scheduler type to use.", + choices=["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"], + ) + parser.add_argument( + "--num_warmup_steps", + type=int, + default=0, + help="Number of steps for the warmup in the lr scheduler.", + ) + parser.add_argument( + "--checkpointing_steps", + type=str, + default=None, + help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.", + ) + parser.add_argument( + "--resume_from_checkpoint", + type=str, + default=None, + help="If the training should continue from a checkpoint folder.", + ) + parser.add_argument( + "--per_device_eval_batch_size", + type=int, + default=8, + help="Batch size (per device) for the evaluation dataloader.", + ) + parser.add_argument( + "--output_dir", + type=str, + default=None, + help="Where to store the final model.", + ) + args = parser.parse_args() + + # Sanity checks + data_files = {} + if args.train_dir is not None: + data_files["train"] = args.train_dir + if args.validation_dir is not None: + data_files["val"] = args.validation_dir + args.data_files = data_files if data_files else None + + if args.push_to_hub: + assert args.output_dir is not None, "Need an `output_dir` to create a repo when `--push_to_hub` is passed." + + return args + + +class MaskGenerator: + """ + A class to generate boolean masks for the pretraining task. + + A mask is a 1D tensor of shape (model_patch_size**2,) where the value is either 0 or 1, + where 1 indicates "masked". 
+ """ + + def __init__(self, input_size=192, mask_patch_size=32, model_patch_size=4, mask_ratio=0.6): + self.input_size = input_size + self.mask_patch_size = mask_patch_size + self.model_patch_size = model_patch_size + self.mask_ratio = mask_ratio + + if self.input_size % self.mask_patch_size != 0: + raise ValueError("Input size must be divisible by mask patch size") + if self.mask_patch_size % self.model_patch_size != 0: + raise ValueError("Mask patch size must be divisible by model patch size") + + self.rand_size = self.input_size // self.mask_patch_size + self.scale = self.mask_patch_size // self.model_patch_size + + self.token_count = self.rand_size**2 + self.mask_count = int(np.ceil(self.token_count * self.mask_ratio)) + + def __call__(self): + mask_idx = np.random.permutation(self.token_count)[: self.mask_count] + mask = np.zeros(self.token_count, dtype=int) + mask[mask_idx] = 1 + + mask = mask.reshape((self.rand_size, self.rand_size)) + mask = mask.repeat(self.scale, axis=0).repeat(self.scale, axis=1) + + return torch.tensor(mask.flatten()) + + +def collate_fn(examples): + pixel_values = torch.stack([example["pixel_values"] for example in examples]) + mask = torch.stack([example["mask"] for example in examples]) + return {"pixel_values": pixel_values, "bool_masked_pos": mask} + + +def main(): + args = parse_args() + + # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The + # information sent is the one passed as arguments along with your Python/PyTorch versions. + send_example_telemetry("run_mim_no_trainer", args) + + # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. + # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers + # in the environment + accelerator_log_kwargs = {} + + if args.with_tracking: + accelerator_log_kwargs["log_with"] = args.report_to + accelerator_log_kwargs["logging_dir"] = args.output_dir + + accelerator = Accelerator( + gradient_accumulation_steps=args.gradient_accumulation_steps, + **accelerator_log_kwargs, + ) + + # Make one log on every process with the configuration for debugging. + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, + ) + logger.info(accelerator.state) + if accelerator.is_local_main_process: + datasets.utils.logging.set_verbosity_warning() + transformers.utils.logging.set_verbosity_info() + else: + datasets.utils.logging.set_verbosity_error() + transformers.utils.logging.set_verbosity_error() + + # If passed along, set the training seed now. + if args.seed is not None: + set_seed(args.seed) + + # Handle the repository creation + if accelerator.is_main_process: + if args.push_to_hub: + if args.hub_model_id is None: + repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token) + else: + repo_name = args.hub_model_id + repo = Repository(args.output_dir, clone_from=repo_name) + + with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore: + if "step_*" not in gitignore: + gitignore.write("step_*\n") + if "epoch_*" not in gitignore: + gitignore.write("epoch_*\n") + elif args.output_dir is not None: + os.makedirs(args.output_dir, exist_ok=True) + accelerator.wait_for_everyone() + + # Initialize our dataset. 
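+    # If --train_dir/--validation_dir were passed, `args.data_files` points `load_dataset` at those
+    # local folders; otherwise the dataset named by --dataset_name (default "cifar10") is fetched from the hub.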
+ ds = load_dataset( + args.dataset_name, + args.dataset_config_name, + data_files=args.data_files, + cache_dir=args.cache_dir, + use_auth_token=True if args.use_auth_token else None, + ) + + # If we don't have a validation split, split off a percentage of train as validation. + args.train_val_split = None if "validation" in ds.keys() else args.train_val_split + if isinstance(args.train_val_split, float) and args.train_val_split > 0.0: + split = ds["train"].train_test_split(args.train_val_split) + ds["train"] = split["train"] + ds["validation"] = split["test"] + + # Create config + # Distributed training: + # The .from_pretrained methods guarantee that only one local process can concurrently + # download model & vocab. + config_kwargs = { + "cache_dir": args.cache_dir, + "revision": args.model_revision, + "use_auth_token": True if args.use_auth_token else None, + } + if args.config_name_or_path: + config = AutoConfig.from_pretrained(args.config_name_or_path, **config_kwargs) + elif args.model_name_or_path: + config = AutoConfig.from_pretrained(args.model_name_or_path, **config_kwargs) + else: + config = CONFIG_MAPPING[args.model_type]() + logger.warning("You are instantiating a new config instance from scratch.") + if args.config_overrides is not None: + logger.info(f"Overriding config: {args.config_overrides}") + config.update_from_string(args.config_overrides) + logger.info(f"New config: {config}") + + # make sure the decoder_type is "simmim" (only relevant for BEiT) + if hasattr(config, "decoder_type"): + config.decoder_type = "simmim" + + # adapt config + args.image_size = args.image_size if args.image_size is not None else config.image_size + args.patch_size = args.patch_size if args.patch_size is not None else config.patch_size + args.encoder_stride = args.encoder_stride if args.encoder_stride is not None else config.encoder_stride + + config.update( + { + "image_size": args.image_size, + "patch_size": args.patch_size, + "encoder_stride": args.encoder_stride, + } + ) + + # create image processor + if args.image_processor_name: + image_processor = AutoImageProcessor.from_pretrained(args.image_processor_name, **config_kwargs) + elif args.model_name_or_path: + image_processor = AutoImageProcessor.from_pretrained(args.model_name_or_path, **config_kwargs) + else: + IMAGE_PROCESSOR_TYPES = { + conf.model_type: image_processor_class for conf, image_processor_class in IMAGE_PROCESSOR_MAPPING.items() + } + image_processor = IMAGE_PROCESSOR_TYPES[args.model_type]() + + # create model + if args.model_name_or_path: + model = AutoModelForMaskedImageModeling.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + cache_dir=args.cache_dir, + revision=args.model_revision, + use_auth_token=True if args.use_auth_token else None, + ) + else: + logger.info("Training new model from scratch") + model = AutoModelForMaskedImageModeling.from_config(config) + + column_names = ds["train"].column_names + + if args.image_column_name is not None: + image_column_name = args.image_column_name + elif "image" in column_names: + image_column_name = "image" + elif "img" in column_names: + image_column_name = "img" + else: + image_column_name = column_names[0] + + # transformations as done in original SimMIM paper + # source: https://github.com/microsoft/SimMIM/blob/main/data/data_simmim.py + transforms = Compose( + [ + Lambda(lambda img: img.convert("RGB")), + RandomResizedCrop(args.image_size, scale=(0.67, 1.0), ratio=(3.0 / 4.0, 4.0 / 3.0)), + 
RandomHorizontalFlip(), + ToTensor(), + Normalize(mean=image_processor.image_mean, std=image_processor.image_std), + ] + ) + + # create mask generator + mask_generator = MaskGenerator( + input_size=args.image_size, + mask_patch_size=args.mask_patch_size, + model_patch_size=args.patch_size, + mask_ratio=args.mask_ratio, + ) + + def preprocess_images(examples): + """Preprocess a batch of images by applying transforms + creating a corresponding mask, indicating + which patches to mask.""" + + examples["pixel_values"] = [transforms(image) for image in examples[image_column_name]] + examples["mask"] = [mask_generator() for i in range(len(examples[image_column_name]))] + + return examples + + if args.max_train_samples is not None: + ds["train"] = ds["train"].shuffle(seed=args.seed).select(range(args.max_train_samples)) + # Set the training transforms + ds["train"].set_transform(preprocess_images) + + if args.max_eval_samples is not None: + ds["validation"] = ds["validation"].shuffle(seed=args.seed).select(range(args.max_eval_samples)) + # Set the validation transforms + ds["validation"].set_transform(preprocess_images) + + # DataLoaders creation: + train_dataloader = DataLoader( + ds["train"], + shuffle=True, + collate_fn=collate_fn, + batch_size=args.per_device_train_batch_size, + ) + eval_dataloader = DataLoader( + ds["validation"], + collate_fn=collate_fn, + batch_size=args.per_device_eval_batch_size, + ) + + # Optimizer + # Split weights in two groups, one with weight decay and the other not. + no_decay = ["bias", "LayerNorm.weight"] + optimizer_grouped_parameters = [ + { + "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], + "weight_decay": args.weight_decay, + }, + { + "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], + "weight_decay": 0.0, + }, + ] + optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate) + + # Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be + # shorter in multiprocess) + + # Scheduler and math around the number of training steps. + overrode_max_train_steps = False + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + if args.max_train_steps is None: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + overrode_max_train_steps = True + + lr_scheduler = get_scheduler( + name=args.lr_scheduler_type, + optimizer=optimizer, + num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps, + num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, + ) + + # Prepare everything with our `accelerator`. + model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare( + model, + optimizer, + train_dataloader, + eval_dataloader, + lr_scheduler, + ) + + # On TPU, the tie weights in our model have been disconnected, so we need to restore the ties. + if accelerator.distributed_type == DistributedType.TPU: + model.tie_weights() + + # We need to recalculate our total training steps as the size of the training dataloader may have changed. 
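+    # (`accelerator.prepare` shards the dataloader across processes, so `len(train_dataloader)`
+    # can shrink under distributed training.)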
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + if overrode_max_train_steps: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + # Afterwards we recalculate our number of training epochs + args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) + + # Figure out how many steps we should save the Accelerator states + checkpointing_steps = args.checkpointing_steps + if checkpointing_steps is not None and checkpointing_steps.isdigit(): + checkpointing_steps = int(checkpointing_steps) + + # We need to initialize the trackers we use, and also store our configuration. + # The trackers initializes automatically on the main process. + if args.with_tracking: + experiment_config = vars(args) + # TensorBoard cannot log Enums, need the raw value + experiment_config["lr_scheduler_type"] = experiment_config["lr_scheduler_type"].value + accelerator.init_trackers("mim_no_trainer", experiment_config) + + # Train! + total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps + + logger.info("***** Running training *****") + logger.info(f" Num examples = {len(ds['train'])}") + logger.info(f" Num Epochs = {args.num_train_epochs}") + logger.info(f" Instantaneous batch size per device = {args.per_device_train_batch_size}") + logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}") + logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") + logger.info(f" Total optimization steps = {args.max_train_steps}") + # Only show the progress bar once on each machine. + progress_bar = tqdm(range(int(args.max_train_steps)), disable=not accelerator.is_local_main_process) + completed_steps = 0 + starting_epoch = 0 + + # Potentially load in the weights and states from a previous save + if args.resume_from_checkpoint: + if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "": + accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}") + accelerator.load_state(args.resume_from_checkpoint) + path = os.path.basename(args.resume_from_checkpoint) + else: + # Get the most recent checkpoint + dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()] + dirs.sort(key=os.path.getctime) + path = dirs[-1] # Sorts folders by date modified, most recent checkpoint is the last + # Extract `epoch_{i}` or `step_{i}` + training_difference = os.path.splitext(path)[0] + + if "epoch" in training_difference: + starting_epoch = int(training_difference.replace("epoch_", "")) + 1 + resume_step = None + else: + # need to multiply `gradient_accumulation_steps` to reflect real steps + resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps + starting_epoch = resume_step // len(train_dataloader) + resume_step -= starting_epoch * len(train_dataloader) + + # update the progress_bar if load from checkpoint + progress_bar.update(starting_epoch * num_update_steps_per_epoch) + completed_steps = starting_epoch * num_update_steps_per_epoch + + for epoch in range(starting_epoch, args.num_train_epochs): + model.train() + if args.with_tracking: + total_loss = 0 + for step, batch in enumerate(train_dataloader): + # We need to skip steps until we reach the resumed step + if args.resume_from_checkpoint and epoch == starting_epoch: + if resume_step is not None and step < resume_step: + if step % args.gradient_accumulation_steps == 0: + 
progress_bar.update(1) + completed_steps += 1 + continue + + with accelerator.accumulate(model): + outputs = model(**batch) + loss = outputs.loss + # We keep track of the loss at each epoch + if args.with_tracking: + total_loss += loss.detach().float() + accelerator.backward(loss) + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: + progress_bar.update(1) + completed_steps += 1 + + if isinstance(checkpointing_steps, int): + if completed_steps % checkpointing_steps == 0: + output_dir = f"step_{completed_steps }" + if args.output_dir is not None: + output_dir = os.path.join(args.output_dir, output_dir) + accelerator.save_state(output_dir) + + if completed_steps >= args.max_train_steps: + break + + model.eval() + losses = [] + for step, batch in enumerate(eval_dataloader): + with torch.no_grad(): + outputs = model(**batch) + + loss = outputs.loss + losses.append(accelerator.gather_for_metrics(loss.repeat(args.per_device_eval_batch_size))) + + losses = torch.cat(losses) + eval_loss = torch.mean(losses) + + logger.info(f"epoch {epoch}: eval_loss: {eval_loss}") + + if args.with_tracking: + accelerator.log( + { + "eval_loss": eval_loss, + "train_loss": total_loss.item() / len(train_dataloader), + "epoch": epoch, + "step": completed_steps, + }, + step=completed_steps, + ) + + if args.push_to_hub and epoch < args.num_train_epochs - 1: + accelerator.wait_for_everyone() + unwrapped_model = accelerator.unwrap_model(model) + unwrapped_model.save_pretrained( + args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save + ) + if accelerator.is_main_process: + image_processor.save_pretrained(args.output_dir) + repo.push_to_hub( + commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True + ) + + if args.checkpointing_steps == "epoch": + output_dir = f"epoch_{epoch}" + if args.output_dir is not None: + output_dir = os.path.join(args.output_dir, output_dir) + accelerator.save_state(output_dir) + + if args.with_tracking: + accelerator.end_training() + + if args.output_dir is not None: + accelerator.wait_for_everyone() + unwrapped_model = accelerator.unwrap_model(model) + unwrapped_model.save_pretrained( + args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save + ) + if accelerator.is_main_process: + image_processor.save_pretrained(args.output_dir) + if args.push_to_hub: + repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True) + + +if __name__ == "__main__": + main() From 312b104ff65514736c0475814fec19e47425b0b5 Mon Sep 17 00:00:00 2001 From: raghavanone <115454562+raghavanone@users.noreply.github.com> Date: Fri, 5 May 2023 22:53:46 +0530 Subject: [PATCH 038/935] Add FlaxWhisperForAudioClassification model (#23173) * Add FlaxWhisperForAudioClassification model * Add models to init * Add models to init * Fix copies * Fix automapping * Fix failing test --- docs/source/en/model_doc/whisper.mdx | 6 + src/transformers/__init__.py | 8 +- .../models/auto/modeling_flax_auto.py | 5 + src/transformers/models/whisper/__init__.py | 2 + .../models/whisper/modeling_flax_whisper.py | 161 ++++++++++++++ src/transformers/utils/dummy_flax_objects.py | 7 + .../whisper/test_modeling_flax_whisper.py | 205 +++++++++++++++++- tests/models/whisper/test_modeling_whisper.py | 8 +- 8 files changed, 395 insertions(+), 7 deletions(-) diff --git a/docs/source/en/model_doc/whisper.mdx 
b/docs/source/en/model_doc/whisper.mdx index 22b08e4e61bc..52a8b5953c63 100644 --- a/docs/source/en/model_doc/whisper.mdx +++ b/docs/source/en/model_doc/whisper.mdx @@ -105,3 +105,9 @@ The original code can be found [here](https://github.com/openai/whisper). [[autodoc]] FlaxWhisperForConditionalGeneration - __call__ + +## FlaxWhisperForAudioClassification + +[[autodoc]] FlaxWhisperForAudioClassification + - __call__ + diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 7bf322ca8e1e..b0766b0946cd 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -3779,6 +3779,7 @@ "FlaxWhisperForConditionalGeneration", "FlaxWhisperModel", "FlaxWhisperPreTrainedModel", + "FlaxWhisperForAudioClassification", ] ) _import_structure["models.xglm"].extend( @@ -6903,7 +6904,12 @@ FlaxWav2Vec2Model, FlaxWav2Vec2PreTrainedModel, ) - from .models.whisper import FlaxWhisperForConditionalGeneration, FlaxWhisperModel, FlaxWhisperPreTrainedModel + from .models.whisper import ( + FlaxWhisperForAudioClassification, + FlaxWhisperForConditionalGeneration, + FlaxWhisperModel, + FlaxWhisperPreTrainedModel, + ) from .models.xglm import FlaxXGLMForCausalLM, FlaxXGLMModel, FlaxXGLMPreTrainedModel from .models.xlm_roberta import ( FLAX_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST, diff --git a/src/transformers/models/auto/modeling_flax_auto.py b/src/transformers/models/auto/modeling_flax_auto.py index 755d1f07a344..e3b8d9cf5b52 100644 --- a/src/transformers/models/auto/modeling_flax_auto.py +++ b/src/transformers/models/auto/modeling_flax_auto.py @@ -229,6 +229,11 @@ ] ) +FLAX_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES = OrderedDict( + [ + ("whisper", "FlaxWhisperForAudioClassification"), + ] +) FLAX_MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, FLAX_MODEL_MAPPING_NAMES) FLAX_MODEL_FOR_PRETRAINING_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_PRETRAINING_MAPPING_NAMES) diff --git a/src/transformers/models/whisper/__init__.py b/src/transformers/models/whisper/__init__.py index 3b6015a56f6f..cd962478e34d 100644 --- a/src/transformers/models/whisper/__init__.py +++ b/src/transformers/models/whisper/__init__.py @@ -75,6 +75,7 @@ "FlaxWhisperForConditionalGeneration", "FlaxWhisperModel", "FlaxWhisperPreTrainedModel", + "FlaxWhisperForAudioClassification", ] @@ -126,6 +127,7 @@ pass else: from .modeling_flax_whisper import ( + FlaxWhisperForAudioClassification, FlaxWhisperForConditionalGeneration, FlaxWhisperModel, FlaxWhisperPreTrainedModel, diff --git a/src/transformers/models/whisper/modeling_flax_whisper.py b/src/transformers/models/whisper/modeling_flax_whisper.py index b8d6f07242d8..1a994acea4df 100644 --- a/src/transformers/models/whisper/modeling_flax_whisper.py +++ b/src/transformers/models/whisper/modeling_flax_whisper.py @@ -36,6 +36,7 @@ FlaxCausalLMOutputWithCrossAttentions, FlaxSeq2SeqLMOutput, FlaxSeq2SeqModelOutput, + FlaxSequenceClassifierOutput, ) from ...modeling_flax_utils import ( ACT2FN, @@ -1506,3 +1507,163 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs): append_replace_return_docstrings( FlaxWhisperForConditionalGeneration, output_type=FlaxSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC ) + + +class FlaxWhisperForAudioClassificationModule(nn.Module): + config: WhisperConfig + dtype: jnp.dtype = jnp.float32 + gradient_checkpointing: bool = False + + def setup(self) -> None: + self.encoder = FlaxWhisperEncoder(config=self.config, dtype=self.dtype) + self.config.is_encoder_decoder = False + num_layers = 
self.config.num_hidden_layers + 1 + if self.config.use_weighted_layer_sum: + self.layer_weights = jnp.repeat(1 / num_layers, num_layers) + self.projector = nn.Dense(self.config.classifier_proj_size, dtype=self.dtype) + self.classifier = nn.Dense(self.config.num_labels, dtype=self.dtype) + + def __call__( + self, + input_features, + encoder_outputs=None, + output_attentions=None, + output_hidden_states: bool = True, + return_dict: bool = True, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if encoder_outputs is None: + encoder_outputs = self.encoder( + input_features, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if self.config.use_weighted_layer_sum: + hidden_states = jnp.stack(encoder_outputs, axis=1) + norm_weights = jax.nn.softmax(self.layer_weights, axis=-1) + hidden_states = jnp.sum(hidden_states * jnp.reshape(norm_weights, [-1, 1, 1]), axis=1) + else: + hidden_states = encoder_outputs[0] + + hidden_states = self.projector(hidden_states) + pooled_output = jnp.mean(hidden_states, axis=1) + + logits = self.classifier(pooled_output) + + if not return_dict: + return (logits,) + encoder_outputs[1:] + + return FlaxSequenceClassifierOutput( + logits=logits, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +@add_start_docstrings("The Whisper Model with an audio classification head on top.", WHISPER_START_DOCSTRING) +class FlaxWhisperForAudioClassification(FlaxWhisperPreTrainedModel): + module_class = FlaxWhisperForAudioClassificationModule + dtype: jnp.dtype = jnp.float32 + + def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict: + # init input tensors + input_features = jnp.zeros(input_shape, dtype="f4") + input_features = input_features.at[(..., -1)].set(self.config.eos_token_id) + + params_rng, dropout_rng = jax.random.split(rng) + rngs = {"params": params_rng, "dropout": dropout_rng} + + random_params = self.module.init( + rngs, + input_features=input_features, + )["params"] + + if params is not None: + random_params = flatten_dict(unfreeze(random_params)) + params = flatten_dict(unfreeze(params)) + for missing_key in self._missing_keys: + params[missing_key] = random_params[missing_key] + self._missing_keys = set() + return freeze(unflatten_dict(params)) + else: + return random_params + + @add_start_docstrings_to_model_forward(WHISPER_INPUTS_DOCSTRING) + def __call__( + self, + input_features: jnp.ndarray, + attention_mask: Optional[jnp.ndarray] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + train: bool = False, + params: dict = None, + dropout_rng: PRNGKey = None, + **kwargs, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + # Handle any PRNG if needed + rngs = {} + if dropout_rng is not None: + rngs["dropout"] = dropout_rng + + return self.module.apply( + {"params": 
params or self.params}, + input_features=jnp.array(input_features, dtype="f4"), + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + rngs=rngs, + ) + + +FLAX_WHISPER_AUDIO_CLASSIFICATION_DOCSTRING = r""" + Returns: + + Transcription example: + + ```python + >>> import jax.numpy as jnp + >>> from transformers import AutoFeatureExtractor, FlaxWhisperForAudioClassification + >>> from datasets import load_dataset + + >>> feature_extractor = AutoFeatureExtractor.from_pretrained("sanchit-gandhi/whisper-medium-fleurs-lang-id") + >>> model = FlaxWhisperForAudioClassification.from_pretrained( + ... "sanchit-gandhi/whisper-medium-fleurs-lang-id", from_pt=True + ... ) + >>> ds = load_dataset("google/fleurs", "all", split="validation", streaming=True) + + >>> sample = next(iter(ds)) + + >>> inputs = feature_extractor( + ... sample["audio"]["array"], sampling_rate=sample["audio"]["sampling_rate"], return_tensors="np" + ... ) + >>> input_features = inputs.input_features + + >>> logits = model(input_features).logits + + >>> predicted_class_ids = jnp.argmax(logits).item() + >>> predicted_label = model.config.id2label[predicted_class_ids] + >>> predicted_label + 'af_za' + ``` +""" + +overwrite_call_docstring( + FlaxWhisperForAudioClassification, WHISPER_INPUTS_DOCSTRING + FLAX_WHISPER_AUDIO_CLASSIFICATION_DOCSTRING +) +append_replace_return_docstrings( + FlaxWhisperForAudioClassification, output_type=FlaxSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC +) diff --git a/src/transformers/utils/dummy_flax_objects.py b/src/transformers/utils/dummy_flax_objects.py index eeec3277492d..ce571bc9f8d0 100644 --- a/src/transformers/utils/dummy_flax_objects.py +++ b/src/transformers/utils/dummy_flax_objects.py @@ -1182,6 +1182,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["flax"]) +class FlaxWhisperForAudioClassification(metaclass=DummyObject): + _backends = ["flax"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["flax"]) + + class FlaxWhisperForConditionalGeneration(metaclass=DummyObject): _backends = ["flax"] diff --git a/tests/models/whisper/test_modeling_flax_whisper.py b/tests/models/whisper/test_modeling_flax_whisper.py index 3f1e201d72d8..79a2c51039ac 100644 --- a/tests/models/whisper/test_modeling_flax_whisper.py +++ b/tests/models/whisper/test_modeling_flax_whisper.py @@ -12,8 +12,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- - import functools import inspect import tempfile @@ -41,6 +39,7 @@ from transformers import ( FLAX_MODEL_MAPPING, + FlaxWhisperForAudioClassification, FlaxWhisperForConditionalGeneration, FlaxWhisperModel, WhisperFeatureExtractor, @@ -704,3 +703,205 @@ def test_tiny_timestamp_generation(self): transcript = processor.batch_decode(generated_ids, skip_special_tokens=True, output_offsets=True) self.assertEqual(transcript, EXPECTED_TRANSCRIPT) + + +class FlaxWhisperEncoderModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=60, + is_training=True, + use_labels=True, + hidden_size=16, + num_hidden_layers=2, + num_attention_heads=4, + input_channels=1, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=20, + max_source_positions=30, + num_mel_bins=80, + num_conv_layers=1, + suppress_tokens=None, + begin_suppress_tokens=None, + classifier_proj_size=4, + num_labels=2, + is_encoder_decoder=False, + is_decoder=False, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_labels = use_labels + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.input_channels = input_channels + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.num_mel_bins = num_mel_bins + self.max_position_embeddings = max_position_embeddings + self.max_source_positions = max_source_positions + self.num_conv_layers = num_conv_layers + self.suppress_tokens = suppress_tokens + self.begin_suppress_tokens = begin_suppress_tokens + self.classifier_proj_size = classifier_proj_size + self.num_labels = num_labels + self.is_encoder_decoder = is_encoder_decoder + self.is_decoder = is_decoder + + def get_config(self): + return WhisperConfig( + d_model=self.hidden_size, + encoder_layers=self.num_hidden_layers, + decoder_layers=self.num_hidden_layers, + encoder_attention_heads=self.num_attention_heads, + decoder_attention_heads=self.num_attention_heads, + input_channels=self.input_channels, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + max_source_positions=self.max_source_positions, + decoder_ffn_dim=self.hidden_size, + encoder_ffn_dim=self.hidden_size, + suppress_tokens=self.suppress_tokens, + begin_suppress_tokens=self.begin_suppress_tokens, + classifier_proj_size=self.classifier_proj_size, + num_labels=self.num_labels, + is_encoder_decoder=self.is_encoder_decoder, + is_decoder=self.is_decoder, + ) + + def prepare_whisper_encoder_inputs_dict( + self, + input_features, + ): + return { + "input_features": input_features, + } + + def prepare_config_and_inputs(self): + input_features = floats_tensor([self.batch_size, self.num_mel_bins, self.seq_length]) + + config = self.get_config() + inputs_dict = self.prepare_whisper_encoder_inputs_dict( + input_features=input_features, + ) + return config, inputs_dict + + def prepare_config_and_inputs_for_common(self): + config, inputs_dict = self.prepare_config_and_inputs() + return config, inputs_dict + + def get_subsampled_output_lengths(self, input_lengths): + """ + Computes the output length of the convolutional layers + """ + + for i in range(self.num_conv_layers): + input_lengths = (input_lengths - 1) // 2 + 1 + + return input_lengths + + @property + def 
encoder_seq_length(self): + return self.get_subsampled_output_lengths(self.seq_length) + + +@require_flax +class WhisperEncoderModelTest(FlaxModelTesterMixin, unittest.TestCase): + all_model_classes = (FlaxWhisperForAudioClassification,) if is_flax_available() else () + is_encoder_decoder = False + fx_compatible = False + test_pruning = False + test_missing_keys = False + + input_name = "input_features" + + def setUp(self): + self.model_tester = FlaxWhisperEncoderModelTester(self) + _, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + self.init_shape = (1,) + inputs_dict["input_features"].shape[1:] + + self.all_model_classes = ( + make_partial_class(model_class, input_shape=self.init_shape) for model_class in self.all_model_classes + ) + self.config_tester = ConfigTester(self, config_class=WhisperConfig) + + def test_config(self): + self.config_tester.run_common_tests() + + # overwrite because of `input_features` + def test_jit_compilation(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + with self.subTest(model_class.__name__): + prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class) + model = model_class(config) + + @jax.jit + def model_jitted(input_features, **kwargs): + return model(input_features=input_features, **kwargs) + + with self.subTest("JIT Enabled"): + jitted_outputs = model_jitted(**prepared_inputs_dict).to_tuple() + + with self.subTest("JIT Disabled"): + with jax.disable_jit(): + outputs = model_jitted(**prepared_inputs_dict).to_tuple() + + self.assertEqual(len(outputs), len(jitted_outputs)) + for jitted_output, output in zip(jitted_outputs, outputs): + self.assertEqual(jitted_output.shape, output.shape) + + # overwrite because of `input_features` + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.__call__) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["input_features", "attention_mask", "output_attentions"] + self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) + + def test_inputs_embeds(self): + pass + + # WhisperEncoder has no inputs_embeds and thus the `get_input_embeddings` fn is not implemented + def test_model_common_attributes(self): + pass + + # WhisperEncoder cannot resize token embeddings since it has no tokens embeddings + def test_resize_tokens_embeddings(self): + pass + + # WhisperEncoder does not have any base model + def test_save_load_to_base(self): + pass + + # WhisperEncoder does not have any base model + def test_save_load_from_base(self): + pass + + # WhisperEncoder does not have any base model + @is_pt_flax_cross_test + def test_save_load_from_base_pt(self): + pass + + # WhisperEncoder does not have any base model + @is_pt_flax_cross_test + def test_save_load_to_base_pt(self): + pass + + # WhisperEncoder does not have any base model + @is_pt_flax_cross_test + def test_save_load_bf16_to_base_pt(self): + pass diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py index 0591c6f46436..0b5b375e9dd3 100644 --- a/tests/models/whisper/test_modeling_whisper.py +++ b/tests/models/whisper/test_modeling_whisper.py @@ -95,7 +95,7 @@ def __init__( self, parent, batch_size=13, - 
seq_length=60, + seq_length=1500, is_training=True, use_labels=False, vocab_size=200, @@ -107,7 +107,7 @@ def __init__( hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, max_position_embeddings=20, - max_source_positions=30, + max_source_positions=750, max_target_positions=40, bos_token_id=98, eos_token_id=98, @@ -1434,7 +1434,7 @@ def __init__( self, parent, batch_size=13, - seq_length=60, + seq_length=3000, is_training=True, use_labels=True, hidden_size=16, @@ -1445,7 +1445,7 @@ def __init__( hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, max_position_embeddings=20, - max_source_positions=30, + max_source_positions=1500, num_mel_bins=80, num_conv_layers=1, suppress_tokens=None, From ef42c2c487260c2a0111fa9d17f2507d84ddedea Mon Sep 17 00:00:00 2001 From: cyy Date: Sat, 6 May 2023 23:41:08 +0800 Subject: [PATCH 039/935] search buffers for dtype (#23159) --- src/transformers/modeling_utils.py | 36 ++++++++++++++++++------------ 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index bf06d9c40538..8f13d4aa2300 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -207,21 +207,29 @@ def get_parameter_dtype(parameter: Union[nn.Module, GenerationMixin, "ModuleUtil # if no floating dtype was found return whatever the first dtype is return last_dtype - else: - # For nn.DataParallel compatibility in PyTorch > 1.5 - def find_tensor_attributes(module: nn.Module) -> List[Tuple[str, Tensor]]: - tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)] - return tuples + for t in parameter.buffers(): + last_dtype = t.dtype + if t.is_floating_point(): + return t.dtype - gen = parameter._named_members(get_members_fn=find_tensor_attributes) - last_tuple = None - for tuple in gen: - last_tuple = tuple - if tuple[1].is_floating_point(): - return tuple[1].dtype - - # fallback to the last dtype - return last_tuple[1].dtype + if last_dtype is not None: + # if no floating dtype was found return whatever the first dtype is + return last_dtype + + # For nn.DataParallel compatibility in PyTorch > 1.5 + def find_tensor_attributes(module: nn.Module) -> List[Tuple[str, Tensor]]: + tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)] + return tuples + + gen = parameter._named_members(get_members_fn=find_tensor_attributes) + last_tuple = None + for tuple in gen: + last_tuple = tuple + if tuple[1].is_floating_point(): + return tuple[1].dtype + + # fallback to the last dtype + return last_tuple[1].dtype def get_state_dict_float_dtype(state_dict): From ef0c380c121881adc9ab981a227e1166d99272f3 Mon Sep 17 00:00:00 2001 From: Ashwin Mathur <97467100+awinml@users.noreply.github.com> Date: Mon, 8 May 2023 04:22:44 +0530 Subject: [PATCH 040/935] Update LLaMA docs with arxiv link (#23191) * Update docs with arxiv link * Update llama model docs --- docs/source/en/model_doc/llama.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/llama.mdx b/docs/source/en/model_doc/llama.mdx index edcb0482097b..a5f0553358be 100644 --- a/docs/source/en/model_doc/llama.mdx +++ b/docs/source/en/model_doc/llama.mdx @@ -14,7 +14,7 @@ specific language governing permissions and limitations under the License. 
## Overview -The LLaMA model was proposed in [LLaMA: Open and Efficient Foundation Language Models](LLaMA: Open and Efficient Foundation Language Models) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample. It is a collection of foundation language models ranging from 7B to 65B parameters. +The LLaMA model was proposed in [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample. It is a collection of foundation language models ranging from 7B to 65B parameters. The abstract from the paper is the following: From 6f8a02844a0eeaf15f29acb7fe40126768936fba Mon Sep 17 00:00:00 2001 From: Bartosz Szmelczynski <43574448+Bearnardd@users.noreply.github.com> Date: Mon, 8 May 2023 00:55:04 +0200 Subject: [PATCH 041/935] fix random attention for pytorch's bigbird/pegasus_bigbird (#23056) * fix random attention usage for bigbird and pegasus_bigbird * remove staticmethod, update tests target valus * revert style changes --- .../models/big_bird/modeling_big_bird.py | 14 ++- .../modeling_bigbird_pegasus.py | 12 ++- .../models/big_bird/test_modeling_big_bird.py | 88 ++++++++----------- .../big_bird/test_modeling_flax_big_bird.py | 16 +--- 4 files changed, 60 insertions(+), 70 deletions(-) diff --git a/src/transformers/models/big_bird/modeling_big_bird.py b/src/transformers/models/big_bird/modeling_big_bird.py index 4f90deb506b7..73a5c7d9b52e 100755 --- a/src/transformers/models/big_bird/modeling_big_bird.py +++ b/src/transformers/models/big_bird/modeling_big_bird.py @@ -1052,9 +1052,8 @@ def _get_rand_attn_plan(from_seq_length, from_block_size, num_rand_blocks): return plan_from_length, plan_num_rand_blocks - @staticmethod def _bigbird_block_rand_mask( - from_seq_length, to_seq_length, from_block_size, to_block_size, num_rand_blocks, last_idx=-1 + self, from_seq_length, to_seq_length, from_block_size, to_block_size, num_rand_blocks, last_idx=-1 ): """ Create adjacency list of random attention. 
@@ -1077,6 +1076,9 @@ def _bigbird_block_rand_mask( raise ValueError("Error the number of blocks needs to be same!") rand_attn = np.zeros((from_seq_length // from_block_size - 2, num_rand_blocks), dtype=np.int32) + # During inference (eval) no randomness + if not self.training: + return rand_attn middle_seq = np.arange(1, to_seq_length // to_block_size - 1, dtype=np.int32) last = to_seq_length // to_block_size - 1 if last_idx > (2 * to_block_size): @@ -1160,11 +1162,17 @@ def _bigbird_block_rand_mask_with_head( plan_block_length = np.array(plan_from_length) // from_block_size # till when to follow plan max_plan_idx = plan_from_length.index(from_seq_length) + # Random Attention adjacency list rand_attn = [ np.zeros((num_blocks, np.sum(plan_num_rand_blocks[: max_plan_idx + 1])), dtype=np.int32) for i in range(num_heads) ] + # During inference (eval) no randomness + if not self.training: + for nh in range(num_heads): + rand_attn[nh] = rand_attn[nh][global_block_top : num_blocks - global_block_bottom, :] + return rand_attn # We will go iteratively over the plan blocks and pick random number of # Attention blocks from the legally allowed blocks @@ -1353,7 +1361,6 @@ def set_attention_type(self, value: str): attn_weights.key = self.self.key self.self = attn_weights self.attention_type = value - if not self.training: self.self.eval() @@ -1380,7 +1387,6 @@ def forward( from_mask = from_mask.to(hidden_states.dtype) if to_mask is not None: to_mask = to_mask.to(hidden_states.dtype) - if self.attention_type == "original_full": self_outputs = self.self( hidden_states, diff --git a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py index 859e019ac953..e4c64e12b554 100755 --- a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +++ b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py @@ -873,9 +873,8 @@ def _get_rand_attn_plan(from_seq_length, from_block_size, num_rand_blocks): return plan_from_length, plan_num_rand_blocks - @staticmethod def _bigbird_block_rand_mask( - from_seq_length, to_seq_length, from_block_size, to_block_size, num_rand_blocks, last_idx=-1 + self, from_seq_length, to_seq_length, from_block_size, to_block_size, num_rand_blocks, last_idx=-1 ): """ Create adjacency list of random attention. 
@@ -898,6 +897,9 @@ def _bigbird_block_rand_mask( raise ValueError("Error the number of blocks needs to be same!") rand_attn = np.zeros((from_seq_length // from_block_size - 2, num_rand_blocks), dtype=np.int32) + # During inference (eval) no randomness + if not self.training: + return rand_attn middle_seq = np.arange(1, to_seq_length // to_block_size - 1, dtype=np.int32) last = to_seq_length // to_block_size - 1 if last_idx > (2 * to_block_size): @@ -981,11 +983,17 @@ def _bigbird_block_rand_mask_with_head( plan_block_length = np.array(plan_from_length) // from_block_size # till when to follow plan max_plan_idx = plan_from_length.index(from_seq_length) + # Random Attention adjacency list rand_attn = [ np.zeros((num_blocks, np.sum(plan_num_rand_blocks[: max_plan_idx + 1])), dtype=np.int32) for i in range(num_heads) ] + # During inference (eval) no randomness + if not self.training: + for nh in range(num_heads): + rand_attn[nh] = rand_attn[nh][global_block_top : num_blocks - global_block_bottom, :] + return rand_attn # We will go iteratively over the plan blocks and pick random number of # Attention blocks from the legally allowed blocks diff --git a/tests/models/big_bird/test_modeling_big_bird.py b/tests/models/big_bird/test_modeling_big_bird.py index 69455935e5e5..f86c6d0ac70a 100644 --- a/tests/models/big_bird/test_modeling_big_bird.py +++ b/tests/models/big_bird/test_modeling_big_bird.py @@ -20,7 +20,7 @@ from transformers import BigBirdConfig, is_torch_available from transformers.models.auto import get_values from transformers.models.big_bird.tokenization_big_bird import BigBirdTokenizer -from transformers.testing_utils import is_pt_flax_cross_test, require_torch, slow, torch_device +from transformers.testing_utils import require_torch, slow, torch_device from ...test_configuration_common import ConfigTester from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask @@ -618,20 +618,6 @@ def check_pt_flax_outputs(self, fx_outputs, pt_outputs, model_class, tol=1e-5, n else: super().check_pt_flax_outputs(fx_outputs, pt_outputs, model_class, tol, name, attributes) - @is_pt_flax_cross_test - @unittest.skip( - reason="Current Pytorch implementation has bug with random attention -> it always uses it not matter if we are in eval/train mode" - ) - def test_equivalence_flax_to_pt(self): - pass - - @is_pt_flax_cross_test - @unittest.skip( - reason="Current Pytorch implementation has bug with random attention -> it always uses it not matter if we are in eval/train mode" - ) - def test_equivalence_pt_to_flax(self): - pass - @require_torch @slow @@ -664,18 +650,19 @@ def test_inference_block_sparse_pretraining(self): expected_prediction_logits_slice = torch.tensor( [ - [-0.2420, -0.6048, -0.0614, 7.8422], - [-0.0596, -0.0104, -1.8408, 9.3352], - [1.0588, 0.7999, 5.0770, 8.7555], - [-0.1385, -1.7199, -1.7613, 6.1094], + [-0.5583, 0.0475, -0.2508, 7.4423], + [0.7409, 1.4460, -0.7593, 7.7010], + [1.9150, 3.1395, 5.8840, 9.3498], + [-0.1854, -1.4640, -2.2052, 3.7968], ], device=torch_device, ) + self.assertTrue( torch.allclose(prediction_logits[0, 128:132, 128:132], expected_prediction_logits_slice, atol=1e-4) ) - expected_seq_relationship_logits = torch.tensor([[58.8196, 56.3629]], device=torch_device) + expected_seq_relationship_logits = torch.tensor([[46.9465, 47.9517]], device=torch_device) self.assertTrue(torch.allclose(seq_relationship_logits, expected_seq_relationship_logits, atol=1e-4)) def test_inference_full_pretraining(self): @@ -787,22 +774,23 @@ 
def test_block_sparse_context_layer(self): blocked_mask, band_mask, from_mask, to_mask = model.create_masks_for_block_sparse_attn( attn_mask, config.block_size ) + targeted_cl = torch.tensor( [ - [0.1874, 1.5260, 0.2335, -0.0473, -0.0961, 1.8384, -0.0141, 0.1250, 0.0085, -0.0048], - [-0.0554, 0.0728, 0.1683, -0.1332, 0.1741, 0.1337, -0.2380, -0.1849, -0.0390, -0.0259], - [-0.0419, 0.0767, 0.1591, -0.1399, 0.1789, 0.1257, -0.2406, -0.1772, -0.0261, -0.0079], - [0.1860, 1.5172, 0.2326, -0.0473, -0.0953, 1.8291, -0.0147, 0.1245, 0.0082, -0.0046], - [0.1879, 1.5296, 0.2335, -0.0471, -0.0975, 1.8433, -0.0136, 0.1260, 0.0086, -0.0054], - [0.1854, 1.5147, 0.2334, -0.0480, -0.0956, 1.8250, -0.0149, 0.1222, 0.0082, -0.0060], - [0.1859, 1.5184, 0.2334, -0.0474, -0.0955, 1.8297, -0.0143, 0.1234, 0.0079, -0.0054], - [0.1885, 1.5336, 0.2335, -0.0467, -0.0979, 1.8481, -0.0130, 0.1269, 0.0085, -0.0049], - [0.1881, 1.5305, 0.2335, -0.0471, -0.0976, 1.8445, -0.0135, 0.1262, 0.0086, -0.0053], - [0.1852, 1.5148, 0.2333, -0.0480, -0.0949, 1.8254, -0.0151, 0.1225, 0.0079, -0.0055], - [0.1877, 1.5292, 0.2335, -0.0470, -0.0972, 1.8431, -0.0135, 0.1259, 0.0084, -0.0052], - [0.1874, 1.5261, 0.2334, -0.0472, -0.0968, 1.8393, -0.0140, 0.1251, 0.0084, -0.0052], - [0.1853, 1.5151, 0.2331, -0.0478, -0.0948, 1.8256, -0.0154, 0.1228, 0.0086, -0.0052], - [0.1867, 1.5233, 0.2334, -0.0475, -0.0965, 1.8361, -0.0139, 0.1247, 0.0084, -0.0054], + [0.1870, 1.5248, 0.2333, -0.0483, -0.0952, 1.8359, -0.0142, 0.1239, 0.0083, -0.0045], + [-0.0601, 0.1243, 0.1329, -0.1524, 0.2347, 0.0894, -0.2248, -0.2461, -0.0645, -0.0109], + [-0.0418, 0.1463, 0.1290, -0.1638, 0.2489, 0.0799, -0.2341, -0.2406, -0.0524, 0.0106], + [0.1859, 1.5182, 0.2324, -0.0473, -0.0952, 1.8295, -0.0148, 0.1242, 0.0080, -0.0045], + [0.1879, 1.5300, 0.2334, -0.0480, -0.0967, 1.8428, -0.0137, 0.1256, 0.0087, -0.0050], + [0.1852, 1.5149, 0.2330, -0.0492, -0.0936, 1.8236, -0.0154, 0.1210, 0.0080, -0.0048], + [0.1857, 1.5186, 0.2331, -0.0484, -0.0940, 1.8285, -0.0148, 0.1224, 0.0077, -0.0045], + [0.1884, 1.5336, 0.2334, -0.0469, -0.0974, 1.8477, -0.0132, 0.1266, 0.0085, -0.0046], + [0.1881, 1.5308, 0.2334, -0.0479, -0.0969, 1.8438, -0.0136, 0.1258, 0.0088, -0.0050], + [0.1849, 1.5143, 0.2329, -0.0491, -0.0930, 1.8230, -0.0156, 0.1209, 0.0074, -0.0047], + [0.1878, 1.5299, 0.2333, -0.0472, -0.0967, 1.8434, -0.0137, 0.1257, 0.0084, -0.0048], + [0.1873, 1.5260, 0.2333, -0.0478, -0.0961, 1.8383, -0.0142, 0.1245, 0.0083, -0.0048], + [0.1849, 1.5145, 0.2327, -0.0491, -0.0935, 1.8237, -0.0156, 0.1215, 0.0083, -0.0046], + [0.1866, 1.5232, 0.2332, -0.0488, -0.0950, 1.8342, -0.0143, 0.1237, 0.0084, -0.0047], ], device=torch_device, ) @@ -851,21 +839,22 @@ def test_tokenizer_inference(self): expected_prediction = torch.tensor( [ - [-0.0213, -0.2213, -0.0061, 0.0687], - [0.0977, 0.1858, 0.2374, 0.0483], - [0.2112, -0.2524, 0.5793, 0.0967], - [0.2473, -0.5070, -0.0630, 0.2174], - [0.2885, 0.1139, 0.6071, 0.2991], - [0.2328, -0.2373, 0.3648, 0.1058], - [0.2517, -0.0689, 0.0555, 0.0880], - [0.1021, -0.1495, -0.0635, 0.1891], - [0.0591, -0.0722, 0.2243, 0.2432], - [-0.2059, -0.2679, 0.3225, 0.6183], - [0.2280, -0.2618, 0.1693, 0.0103], - [0.0183, -0.1375, 0.2284, -0.1707], + [0.1887, -0.0474, 0.2604, 0.1453], + [0.0651, 0.1999, 0.1797, 0.1161], + [0.2833, -0.3036, 0.6910, 0.1123], + [0.2836, -0.4644, -0.0111, 0.1530], + [0.3919, -0.2823, 0.4192, 0.1687], + [0.2168, -0.1956, 0.4050, 0.0925], + [0.2597, -0.0884, 0.1258, 0.1119], + [0.1127, -0.1203, 0.1924, 0.2859], + [0.1362, -0.1315, 
0.2693, 0.1027], + [-0.3169, -0.2266, 0.4419, 0.6740], + [0.2366, -0.1452, 0.2589, 0.0579], + [0.0358, -0.2021, 0.3112, -0.1392], ], device=torch_device, ) + self.assertTrue(torch.allclose(prediction[0, 52:64, 320:324], expected_prediction, atol=1e-4)) def test_inference_question_answering(self): @@ -908,11 +897,12 @@ def test_inference_question_answering(self): # fmt: off target_start_logits = torch.tensor( - [[-8.9304, -10.3849, -14.4997, -9.6497, -13.9469, -7.8134, -8.9687, -13.3585, -9.7987, -13.8869, -9.2632, -8.9294, -13.6721, -7.3198, -9.5434, -11.2641, -14.3245, -9.5705, -12.7367, -8.6168, -11.083, -13.7573, -8.1151, -14.5329, -7.6876, -15.706, -12.8558, -9.1135, 8.0909, -3.1925, -11.5812, -9.4822], [-11.5595, -14.5591, -10.2978, -14.8445, -10.2092, -11.1899, -13.8356, -10.5644, -14.7706, -9.9841, -11.0052, -14.1862, -8.8173, -11.1098, -12.4686, -15.0531, -11.0196, -13.6614, -10.0236, -11.8151, -14.8744, -9.5123, -15.1605, -8.6472, -15.4184, -8.898, -9.6328, -7.0258, -11.3365, -14.4065, -10.2587, -8.9103]], # noqa: E231 + [[-8.5622, -9.6209, -14.3351, -8.7032, -11.8596, -7.7446, -9.6730, -13.6063, -8.9651, -11.7417, -8.2641, -8.7056, -13.4116, -5.6600, -8.8316, -10.4148, -12.2180, -7.7979, -12.5274, -6.0685, -10.3373, -11.3128, -6.6456, -14.4030, -6.8292, -14.5383, -11.5638, -6.3326, 11.5293, -1.8434, -10.0013, -7.6150], [-10.7384, -13.1179, -10.1837, -13.7700, -10.0186, -11.7335, -13.3411, -10.0188, -13.4235, -9.9381, -10.4252, -13.1281, -8.2022, -10.4326, -11.5542, -14.1549, -10.7546, -13.4691, -8.2744, -11.4324, -13.3773, -9.8284, -14.5825, -8.7471, -14.7050, -8.0364, -11.3627, -6.4638, -11.7031, -14.3446, -9.9425, -8.0088]], # noqa: E231 device=torch_device, ) + target_end_logits = torch.tensor( - [[-12.4131, -8.5959, -15.7163, -11.1524, -15.9913, -12.2038, -7.8902, -16.0296, -12.164, -16.5017, -13.3332, -6.9488, -15.7756, -13.8506, -11.0779, -9.2893, -15.0426, -10.1963, -17.3292, -12.2945, -11.5337, -16.4514, -9.1564, -17.5001, -9.1562, -16.2971, -13.3199, -7.5724, -5.1175, 7.2168, -10.3804, -11.9873], [-10.8654, -14.9967, -11.4144, -16.9189, -14.2673, -9.7068, -15.0182, -12.8846, -16.8716, -13.665, -10.3113, -15.1436, -14.9069, -13.3364, -11.2339, -16.0118, -11.8331, -17.0613, -13.8852, -12.4163, -16.8978, -10.7772, -17.2324, -10.6979, -16.9811, -10.3427, -9.497, -13.7104, -11.1107, -13.2936, -13.855, -14.1264]], # noqa: E231 + [[-12.1736, -8.8487, -14.8877, -11.6713, -15.1165, -12.2396, -7.6828, -15.4153, -12.2528, -14.3671, -12.3596, -7.4272, -14.9615, -13.6356, -11.7939, -9.9767, -14.8112, -8.9567, -15.8798, -11.5291, -9.4249, -14.7544, -7.9387, -16.2789, -8.9702, -15.3111, -11.5585, -7.9992, -4.1127, 10.3209, -8.3926, -10.2005], [-11.1375, -15.4027, -12.6861, -16.9884, -13.7093, -10.3560, -15.7228, -12.9290, -15.8519, -13.7953, -10.2460, -15.7198, -14.2078, -12.8477, -11.4861, -16.1017, -11.8900, -16.4488, -13.2959, -10.3980, -15.4874, -10.3539, -16.8263, -10.9973, -17.0344, -9.2751, -10.1196, -13.8907, -12.1025, -13.0628, -12.8530, -13.8173]], # noqa: E321 device=torch_device, ) # fmt: on @@ -954,7 +944,7 @@ def test_auto_padding(self): # fmt: off target = torch.tensor( - [[-0.045136, -0.068013, 0.12246, -0.01356, 0.018386, 0.025333, -0.0044439, -0.0030996, -0.064031, 0.0006439], [-0.045018, -0.067638, 0.12317, -0.013998, 0.019216, 0.025695, -0.0043705, -0.0031895, -0.063153, 0.00088899], [-0.045042, -0.067305, 0.1234, -0.014512, 0.020057, 0.026084, -0.004615, -0.0031728, -0.062442, 0.0010263], [-0.044589, -0.067655, 0.12416, -0.014287, 0.019416, 0.026065, -0.0050958, 
-0.002702, -0.063158, 0.0004827], [-0.044627, -0.067535, 0.1239, -0.014319, 0.019491, 0.026213, -0.0059482, -0.0025906, -0.063116, 0.00014669], [-0.044899, -0.067704, 0.12337, -0.014231, 0.019256, 0.026345, -0.0065565, -0.0022938, -0.063433, -0.00011409], [-0.045599, -0.067764, 0.12235, -0.014151, 0.019206, 0.026417, -0.0068965, -0.0024494, -0.063313, -4.4499e-06], [-0.045557, -0.068372, 0.12199, -0.013747, 0.017962, 0.026103, -0.0070607, -0.0023552, -0.06447, -0.00048756], [-0.045334, -0.068913, 0.1217, -0.013566, 0.01693, 0.025745, -0.006311, -0.0024903, -0.065575, -0.0006719], [-0.045171, -0.068726, 0.12164, -0.013688, 0.017139, 0.025629, -0.005213, -0.0029412, -0.065237, -0.00020669], [-0.044411, -0.069267, 0.12206, -0.013645, 0.016212, 0.025589, -0.0044121, -0.002972, -0.066277, -0.00067963], [-0.043487, -0.069792, 0.1232, -0.013663, 0.015303, 0.02613, -0.0036294, -0.0030616, -0.067483, -0.0012642], [-0.042622, -0.069287, 0.12469, -0.013936, 0.016204, 0.026474, -0.0040534, -0.0027365, -0.066994, -0.0014148], [-0.041879, -0.070031, 0.12593, -0.014047, 0.015082, 0.027751, -0.0040683, -0.0027189, -0.068985, -0.0027146]], # noqa: E231 + [[-0.129420, -0.164740, 0.042422, -0.336030, 0.094379, 0.033794, 0.384590, 0.229660, -0.196500, 0.108020], [-0.000154, -0.168800, 0.165820, -0.313670, 0.101240, 0.035145, 0.381880, 0.213730, -0.201080, 0.077443], [0.053754, -0.166350, 0.225520, -0.272900, 0.119670, 0.019987, 0.348670, 0.199190, -0.181600, 0.084640], [0.063636, -0.187110, 0.237010, -0.297380, 0.126300, 0.020025, 0.268490, 0.191820, -0.192300, 0.035077], [0.073893, -0.184790, 0.188870, -0.297860, 0.134280, 0.028972, 0.174650, 0.186890, -0.180530, 0.006851], [0.005253, -0.169360, 0.123100, -0.302550, 0.126930, 0.024188, 0.133410, 0.200600, -0.168210, -0.001006], [-0.093336, -0.175370, -0.004768, -0.333170, 0.114330, 0.034168, 0.120960, 0.203570, -0.162810, -0.005757], [-0.160210, -0.169310, -0.049064, -0.331950, 0.115730, 0.027062, 0.143600, 0.205310, -0.144580, 0.026746], [-0.193200, -0.156820, -0.079422, -0.351600, 0.106450, 0.032174, 0.245690, 0.210250, -0.173480, 0.043914], [-0.167980, -0.153050, -0.059764, -0.357890,0.103910, 0.031481, 0.334190, 0.208960,-0.178180, 0.072165], [-0.136990, -0.156950, -0.012099, -0.353140,0.096996, 0.025864, 0.376340, 0.216050, -0.171820, 0.089963], [-0.041143, -0.167060, 0.079754, -0.353220, 0.093247, 0.019867, 0.385810, 0.214340, -0.191800, 0.065946],[0.040373, -0.158610, 0.152570, -0.312930, 0.110590, 0.012282, 0.345270, 0.204040, -0.176500, 0.064972], [0.043762, -0.166450, 0.179500, -0.317930, 0.117280, -0.004040, 0.304490, 0.201380, -0.182780, 0.044000]], # noqa: E231 device=torch_device, ) # fmt: on diff --git a/tests/models/big_bird/test_modeling_flax_big_bird.py b/tests/models/big_bird/test_modeling_flax_big_bird.py index 6c91106a8514..63b2237fbddc 100644 --- a/tests/models/big_bird/test_modeling_flax_big_bird.py +++ b/tests/models/big_bird/test_modeling_flax_big_bird.py @@ -15,7 +15,7 @@ import unittest from transformers import BigBirdConfig, is_flax_available -from transformers.testing_utils import is_pt_flax_cross_test, require_flax, slow +from transformers.testing_utils import require_flax, slow from ...test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor, random_attention_mask @@ -221,17 +221,3 @@ def check_pt_flax_outputs(self, fx_outputs, pt_outputs, model_class, tol=1e-5, n return else: super().check_pt_flax_outputs(fx_outputs, pt_outputs, model_class, tol, name, attributes) - - @is_pt_flax_cross_test - @unittest.skip( - 
reason="Current Pytorch implementation has bug with random attention -> it always uses it not matter if we are in eval/train mode" - ) - def test_equivalence_flax_to_pt(self): - pass - - @is_pt_flax_cross_test - @unittest.skip( - reason="Current Pytorch implementation has bug with random attention -> it always uses it not matter if we are in eval/train mode" - ) - def test_equivalence_pt_to_flax(self): - pass From dbc12269ed5546b2da9236b9f1078b95b6a4d3d5 Mon Sep 17 00:00:00 2001 From: Robert Baruch Date: Sun, 7 May 2023 16:06:24 -0700 Subject: [PATCH 042/935] Fix hf_argparser.parse_json_file to open file with utf-8 encoding, close file when finished (#23194) * Open json args in utf-8 encoding, close file when finished * black formatted --- src/transformers/hf_argparser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/hf_argparser.py b/src/transformers/hf_argparser.py index f808acebe902..b31497dd103e 100644 --- a/src/transformers/hf_argparser.py +++ b/src/transformers/hf_argparser.py @@ -401,8 +401,8 @@ def parse_json_file(self, json_file: str, allow_extra_keys: bool = False) -> Tup - the dataclass instances in the same order as they were passed to the initializer. """ - open_json_file = open(Path(json_file)) - data = json.loads(open_json_file.read()) + with open(Path(json_file), encoding="utf-8") as open_json_file: + data = json.loads(open_json_file.read()) outputs = self.parse_dict(data, allow_extra_keys=allow_extra_keys) return tuple(outputs) From bbfb9fc22bdd49a45dd6ed850fc78c4d99b59afb Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Mon, 8 May 2023 10:45:40 +0100 Subject: [PATCH 043/935] =?UTF-8?q?Generate:=20starcoder=20=F0=9F=A4=9C=20?= =?UTF-8?q?=F0=9F=A4=9B=20assisted=20generation=20(#23182)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * starcoder has joined the chat * indexing that works for all --- src/transformers/generation/utils.py | 14 ++++++++++++-- tests/generation/test_utils.py | 4 ++-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 0f0191fb144f..8c8a67fa5cb0 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -4221,6 +4221,9 @@ def assisted_decoding( # keep track of which sequences are already finished unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1) + # other auxiliary variables + max_len = stopping_criteria[0].max_length + this_peer_finished = False # used by synced_gpus only while True: if synced_gpus: @@ -4235,7 +4238,7 @@ def assisted_decoding( # Assistant: main logic start cur_len = input_ids.shape[-1] - max_len = stopping_criteria[0].max_length + assistant_kv_indexing = 0 if "bloom" not in assistant_model.__class__.__name__.lower() else 1 # 1. Forecast next N tokens using the assistant model. This `for` block can be replaced with a # `.generate()` call if we decide to add `past_key_values` as a possible output of generate, as we @@ -4244,7 +4247,7 @@ def assisted_decoding( for _ in range(int(assistant_model.max_assistant_tokens)): # 1.1. 
use the assistant model to obtain the next candidate logits if "assistant_past_key_values" in model_kwargs: - prev_seq_len = model_kwargs["assistant_past_key_values"][0][0].shape[2] + prev_seq_len = model_kwargs["assistant_past_key_values"][0][assistant_kv_indexing].shape[-2] # `new_token_len` can be 1 or 2 (next token in assistant + last token picked by the larger model) new_token_len = candidate_input_ids.shape[1] - prev_seq_len assist_inputs = candidate_input_ids[:, -new_token_len:] @@ -4505,6 +4508,13 @@ def _crop_past_key_values(model, past_key_values, maximum_length): ) ) past_key_values = tuple(new_past) + elif "gptbigcode" in model.__class__.__name__.lower(): # gptbigcode is too + if model.config.multi_query: + for idx in range(len(past_key_values)): + past_key_values[idx] = past_key_values[idx][:, :maximum_length, :] + else: + for idx in range(len(past_key_values)): + past_key_values[idx] = past_key_values[idx][:, :, :maximum_length, :] else: for idx in range(len(past_key_values)): new_past.append( diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index 3b96f2b2bdff..70de057d5fe7 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -1473,7 +1473,7 @@ def test_assisted_decoding_matches_greedy_search(self): # may fix in the future: the following models fail with assisted decoding, and need model-specific fixes if any( model_name in model_class.__name__.lower() - for model_name in ["bigbirdpegasus", "gptbigcode", "led", "mega", "speech2text", "git", "prophetnet"] + for model_name in ["bigbirdpegasus", "led", "mega", "speech2text", "git", "prophetnet"] ): return @@ -1529,7 +1529,7 @@ def test_assisted_decoding_sample(self): # may fix in the future: the following models fail with assisted decoding, and need model-specific fixes if any( model_name in model_class.__name__.lower() - for model_name in ["bigbirdpegasus", "gptbigcode", "led", "mega", "speech2text", "git", "prophetnet"] + for model_name in ["bigbirdpegasus", "led", "mega", "speech2text", "git", "prophetnet"] ): return From 843fdf2e420a95bdf93f5c7b5c43151fdaa98e48 Mon Sep 17 00:00:00 2001 From: Orr Zohar <108689663+orrzohar@users.noreply.github.com> Date: Mon, 8 May 2023 04:35:04 -0700 Subject: [PATCH 044/935] Fixing class embedding selection in owl-vit (#23157) fixing class embedding selection in owl-vit --- src/transformers/models/owlvit/modeling_owlvit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/owlvit/modeling_owlvit.py b/src/transformers/models/owlvit/modeling_owlvit.py index 1a0f3ed0f6d8..93872fffcf41 100644 --- a/src/transformers/models/owlvit/modeling_owlvit.py +++ b/src/transformers/models/owlvit/modeling_owlvit.py @@ -1499,7 +1499,7 @@ def embed_image_query( selected_inds = (ious[0] >= iou_threshold).nonzero() if selected_inds.numel(): - selected_embeddings = class_embeds[i][selected_inds[0]] + selected_embeddings = class_embeds[i][selected_inds.squeeze(1)] mean_embeds = torch.mean(class_embeds[i], axis=0) mean_sim = torch.einsum("d,id->i", mean_embeds, selected_embeddings) best_box_ind = selected_inds[torch.argmin(mean_sim)] From fd6970bc56afa5a011ab9b839f73fae1c9e72625 Mon Sep 17 00:00:00 2001 From: Sylvain Gugger Date: Mon, 8 May 2023 08:52:44 -0400 Subject: [PATCH 045/935] Skip failing test --- examples/tensorflow/test_tensorflow_examples.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/tensorflow/test_tensorflow_examples.py b/examples/tensorflow/test_tensorflow_examples.py index 
956209baade4..d5ae4c71b869 100644 --- a/examples/tensorflow/test_tensorflow_examples.py +++ b/examples/tensorflow/test_tensorflow_examples.py @@ -297,6 +297,7 @@ def test_run_translation(self): result = get_results(tmp_dir) self.assertGreaterEqual(result["bleu"], 30) + @skip("Fix me Matt") def test_run_image_classification(self): tmp_dir = self.get_auto_remove_tmp_dir() testargs = f""" From 94056b57beb4499f4f74d5d88a41e8266cc01778 Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Mon, 8 May 2023 09:47:08 -0400 Subject: [PATCH 046/935] New version of Accelerate for the Trainer (#23204) --- setup.py | 2 +- src/transformers/dependency_versions_table.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index d8f58360d7af..41e52e7f7c8e 100644 --- a/setup.py +++ b/setup.py @@ -102,7 +102,7 @@ # 2. once modified, run: `make deps_table_update` to update src/transformers/dependency_versions_table.py _deps = [ "Pillow", - "accelerate>=0.17.0", + "accelerate>=0.19.0", "av==9.2.0", # Latest version of PyAV (10.0.0) has issues with audio stream. "beautifulsoup4", "black~=23.1", diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index dd1055ebd2e4..f325447a109d 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -3,7 +3,7 @@ # 2. run `make deps_table_update`` deps = { "Pillow": "Pillow", - "accelerate": "accelerate>=0.17.0", + "accelerate": "accelerate>=0.19.0", "av": "av==9.2.0", "beautifulsoup4": "beautifulsoup4", "black": "black~=23.1", From 188a8bfcccc6b862fe7ccc2859d977c01dd98136 Mon Sep 17 00:00:00 2001 From: Connor Henderson Date: Mon, 8 May 2023 14:56:42 -0400 Subject: [PATCH 047/935] docs: Fix broken link in 'How to add a model...' (#23216) fix link --- docs/source/en/add_new_model.mdx | 2 +- templates/adding_a_new_model/ADD_NEW_MODEL_PROPOSAL_TEMPLATE.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/en/add_new_model.mdx b/docs/source/en/add_new_model.mdx index 49dce27600ce..38efaa945ded 100644 --- a/docs/source/en/add_new_model.mdx +++ b/docs/source/en/add_new_model.mdx @@ -678,7 +678,7 @@ model.save_pretrained("/path/to/converted/checkpoint/folder") **7. Implement the forward pass** Having managed to correctly load the pretrained weights into the 🤗 Transformers implementation, you should now make -sure that the forward pass is correctly implemented. In [Get familiar with the original repository](#run-a-pretrained-checkpoint-using-the-original-repository), you have already created a script that runs a forward +sure that the forward pass is correctly implemented. In [Get familiar with the original repository](#34-run-a-pretrained-checkpoint-using-the-original-repository), you have already created a script that runs a forward pass of the model using the original repository. Now you should write an analogous script using the 🤗 Transformers implementation instead of the original one. 
It should look as follows: diff --git a/templates/adding_a_new_model/ADD_NEW_MODEL_PROPOSAL_TEMPLATE.md b/templates/adding_a_new_model/ADD_NEW_MODEL_PROPOSAL_TEMPLATE.md index 10bbd2011096..201806837591 100644 --- a/templates/adding_a_new_model/ADD_NEW_MODEL_PROPOSAL_TEMPLATE.md +++ b/templates/adding_a_new_model/ADD_NEW_MODEL_PROPOSAL_TEMPLATE.md @@ -848,7 +848,7 @@ model.save_pretrained("/path/to/converted/checkpoint/folder") Having managed to correctly load the pretrained weights into the 🤗 Transformers implementation, you should now make sure that the forward pass is correctly implemented. In [Get familiar with the original -repository](#run-a-pretrained-checkpoint-using-the-original-repository), +repository](#34-run-a-pretrained-checkpoint-using-the-original-repository), you have already created a script that runs a forward pass of the model using the original repository. Now you should write an analogous script using the 🤗 Transformers implementation instead of the original one. It From 006da469dd5a465f4551f4245f780e3b1e92b76c Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Mon, 8 May 2023 18:36:22 -0400 Subject: [PATCH 048/935] Pin tensorflow-probability (#23220) * Pin tensorflow-probability * [all-test] * [all-test] Fix syntax for bash --- .circleci/create_circleci_config.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.circleci/create_circleci_config.py b/.circleci/create_circleci_config.py index 53c0279c4a0a..7208d876a97c 100644 --- a/.circleci/create_circleci_config.py +++ b/.circleci/create_circleci_config.py @@ -186,7 +186,7 @@ def job_name(self): "git lfs install", "pip install --upgrade pip", "pip install .[sklearn,tf-cpu,torch,testing,sentencepiece,torch-speech,vision]", - "pip install tensorflow_probability", + 'pip install "tensorflow_probability<0.20"', "pip install git+https://github.com/huggingface/accelerate", ], marker="is_pt_tf_cross_test", @@ -227,7 +227,7 @@ def job_name(self): "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng cmake", "pip install --upgrade pip", "pip install .[sklearn,tf-cpu,testing,sentencepiece,tf-speech,vision]", - "pip install tensorflow_probability", + 'pip install "tensorflow_probability<0.20"', ], parallelism=1, pytest_options={"rA": None}, @@ -266,7 +266,7 @@ def job_name(self): "sudo apt-get -y update && sudo apt-get install -y cmake", "pip install --upgrade pip", "pip install .[sklearn,tf-cpu,testing,sentencepiece,vision]", - "pip install tensorflow_probability", + 'pip install "tensorflow_probability<0.20"', ], pytest_options={"rA": None}, marker="is_pipeline_test", From 431b04d8c410e3fc1a0f85d43d74ee6927bd95c5 Mon Sep 17 00:00:00 2001 From: NielsRogge <48327001+NielsRogge@users.noreply.github.com> Date: Tue, 9 May 2023 14:58:19 +0200 Subject: [PATCH 049/935] [SAM] Add resources (#23224) Add resources --- docs/source/en/model_doc/sam.mdx | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/docs/source/en/model_doc/sam.mdx b/docs/source/en/model_doc/sam.mdx index 70e93d2ae2cb..969b7e2b2290 100644 --- a/docs/source/en/model_doc/sam.mdx +++ b/docs/source/en/model_doc/sam.mdx @@ -22,7 +22,7 @@ The model can be used to predict segmentation masks of any object of interest gi The abstract from the paper is the following: -*We introduce the Segment Anything (SA) project: a new task, model, and dataset for image segmentation. 
Using our efficient model in a data collection loop, we built the largest segmentation dataset to date (by far), with over 1 billion masks on 11M licensed and privacy respecting images. The model is designed and trained to be promptable, so it can transfer zero-shot to new image distributions and tasks. We evaluate its capabilities on numerous tasks and find that its zero-shot performance is impressive -- often competitive with or even superior to prior fully supervised results. We are releasing the Segment Anything Model (SAM) and corresponding dataset (SA-1B) of 1B masks and 11M images at \href{https://segment-anything.com}{https://segment-anything.com} to foster research into foundation models for computer vision.* +*We introduce the Segment Anything (SA) project: a new task, model, and dataset for image segmentation. Using our efficient model in a data collection loop, we built the largest segmentation dataset to date (by far), with over 1 billion masks on 11M licensed and privacy respecting images. The model is designed and trained to be promptable, so it can transfer zero-shot to new image distributions and tasks. We evaluate its capabilities on numerous tasks and find that its zero-shot performance is impressive -- often competitive with or even superior to prior fully supervised results. We are releasing the Segment Anything Model (SAM) and corresponding dataset (SA-1B) of 1B masks and 11M images at [https://segment-anything.com](https://segment-anything.com) to foster research into foundation models for computer vision.* Tips: @@ -63,8 +63,10 @@ scores = outputs.iou_scores Resources: -- [Demo notebook](https://github.com/huggingface/notebooks/blob/main/examples/segment_anything.ipynb) for using the model -- [Demo notebook](https://github.com/huggingface/notebooks/blob/main/examples/automatic_mask_generation.ipynb) for using automatic mask generation pipeline. +- [Demo notebook](https://github.com/huggingface/notebooks/blob/main/examples/segment_anything.ipynb) for using the model. +- [Demo notebook](https://github.com/huggingface/notebooks/blob/main/examples/automatic_mask_generation.ipynb) for using the automatic mask generation pipeline. +- [Demo notebook](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/SAM/Run_inference_with_MedSAM_using_HuggingFace_Transformers.ipynb) for inference with MedSAM, a fine-tuned version of SAM on the medical domain. +- [Demo notebook](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/SAM/Fine_tune_SAM_(segment_anything)_on_a_custom_dataset.ipynb) for fine-tuning the model on custom data. 
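The automatic mask generation pipeline referenced in the resources above can be invoked with only a few lines. The sketch below is illustrative rather than a recommended configuration: the checkpoint name, the image path, and the `points_per_batch` value are placeholders to adapt to your own setup:

```python
from transformers import pipeline

# placeholder checkpoint and image path, chosen only to illustrate the call pattern
generator = pipeline("mask-generation", model="facebook/sam-vit-base")
outputs = generator("path/to/image.png", points_per_batch=64)

masks = outputs["masks"]  # one binary mask per detected object
scores = outputs["scores"]  # the corresponding predicted IoU scores
```
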
## SamConfig From 7f9195090160d508c7afb2e444e34f181872dd10 Mon Sep 17 00:00:00 2001 From: Matthijs Hollemans Date: Tue, 9 May 2023 15:10:17 +0200 Subject: [PATCH 050/935] audio_utils improvements (#21998) * silly change to allow making a PR * clean up doc comments * simplify hertz_to_mel and mel_to_hertz * fixup * clean up power_to_db * also add amplitude_to_db * move functions * clean up mel_filter_bank * fixup * credit librosa & torchaudio authors * add unit tests * tests for power_to_db and amplitude_to_db * add mel_filter_bank tests * rewrite STFT * add convenience spectrogram function * missing transpose * fewer transposes * add integration test to M-CTC-T * frame length can be either window or FFT length * rewrite stft API * add preemphasis coefficient * move argument * add log option to spectrogram * replace M-CTC-T feature extractor * fix api thing * replace whisper STFT * replace whisper mel filters * replace tvlt's stft * allow alternate window names * replace speecht5 stft * fixup * fix integration tests * fix doc comments * remove manual FFT length calculation * fix docs * go away, deprecation warnings * combine everything into spectrogram function * add deprecated functions back * fixup --- docs/source/en/internal/audio_utils.mdx | 15 +- src/transformers/audio_utils.py | 648 ++++++++++++----- .../models/clap/feature_extraction_clap.py | 53 +- .../models/mctct/feature_extraction_mctct.py | 116 +--- .../speecht5/feature_extraction_speecht5.py | 60 +- .../models/tvlt/feature_extraction_tvlt.py | 153 +--- .../whisper/feature_extraction_whisper.py | 147 +--- ...xtraction_audio_spectrogram_transformer.py | 5 +- .../mctct/test_feature_extraction_mctct.py | 39 +- .../test_feature_extraction_speech_to_text.py | 24 + .../test_feature_extraction_speecht5.py | 4 +- .../tvlt/test_feature_extraction_tvlt.py | 6 +- .../test_feature_extraction_whisper.py | 5 +- tests/utils/test_audio_utils.py | 652 ++++++++++++++++++ 14 files changed, 1334 insertions(+), 593 deletions(-) create mode 100644 tests/utils/test_audio_utils.py diff --git a/docs/source/en/internal/audio_utils.mdx b/docs/source/en/internal/audio_utils.mdx index 8f1d6597149d..74c2fe82a363 100644 --- a/docs/source/en/internal/audio_utils.mdx +++ b/docs/source/en/internal/audio_utils.mdx @@ -12,10 +12,9 @@ specific language governing permissions and limitations under the License. # Utilities for `FeatureExtractors` -This page lists all the utility functions that can be used by the audio [`FeatureExtractor`] in order to compute special features from a raw audio using common algorithms such as *Short Time Fourier Transform* or *Mel log spectrogram*. +This page lists all the utility functions that can be used by the audio [`FeatureExtractor`] in order to compute special features from a raw audio using common algorithms such as *Short Time Fourier Transform* or *log mel spectrogram*. - -Most of those are only useful if you are studying the code of the image processors in the library. +Most of those are only useful if you are studying the code of the audio processors in the library. 
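Taken together, these helpers let a feature extractor go from a raw waveform to a log-mel spectrogram in a handful of calls. The sketch below only shows how the pieces fit: the frame length, hop length, FFT size, and mel-bin count are arbitrary illustrative values, the silent waveform stands in for real audio, and the keyword arguments passed to `spectrogram` assume the frame/hop/FFT/mel/log options this module introduces.

```python
import numpy as np

from transformers.audio_utils import mel_filter_bank, spectrogram, window_function

sampling_rate = 16000
waveform = np.zeros(sampling_rate, dtype=np.float32)  # one second of silence as a stand-in

# a 400-sample Hann window, matching the 400-sample analysis frames below
window = window_function(400, "hann")

# triangular filters projecting 257 frequency bins (fft_length // 2 + 1) onto 80 mel bins
# (illustrative values, not recommended settings)
mel_filters = mel_filter_bank(
    num_frequency_bins=257,
    num_mel_filters=80,
    min_frequency=0.0,
    max_frequency=8000.0,
    sampling_rate=sampling_rate,
    norm="slaney",
    mel_scale="slaney",
)

# power spectrogram projected onto the mel filters, then compressed with log10
log_mel_spec = spectrogram(
    waveform,
    window,
    frame_length=400,
    hop_length=160,
    fft_length=512,
    power=2.0,
    mel_filters=mel_filters,
    log_mel="log10",
)
```
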
## Audio Transformations @@ -23,12 +22,14 @@ Most of those are only useful if you are studying the code of the image processo [[autodoc]] audio_utils.mel_to_hertz -[[autodoc]] audio_utils.get_mel_filter_banks +[[autodoc]] audio_utils.mel_filter_bank -[[autodoc]] audio_utils.stft +[[autodoc]] audio_utils.optimal_fft_length -[[autodoc]] audio_utils.power_to_db +[[autodoc]] audio_utils.window_function -[[autodoc]] audio_utils.fram_wave +[[autodoc]] audio_utils.spectrogram +[[autodoc]] audio_utils.power_to_db +[[autodoc]] audio_utils.amplitude_to_db diff --git a/src/transformers/audio_utils.py b/src/transformers/audio_utils.py index 73bc041d6961..a34892af4123 100644 --- a/src/transformers/audio_utils.py +++ b/src/transformers/audio_utils.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. +# Copyright 2023 The HuggingFace Inc. team and the librosa & torchaudio authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,66 +13,61 @@ # See the License for the specific language governing permissions and # limitations under the License. """ - Audio processing functions to extract feature from a raw audio. Should all be in numpy to support all frameworks, and - remmove unecessary dependencies. +Audio processing functions to extract features from audio waveforms. This code is pure numpy to support all frameworks +and remove unnecessary dependencies. """ -import math import warnings -from typing import Optional +from typing import Optional, Union import numpy as np -from numpy.fft import fft -def hertz_to_mel(freq: float, mel_scale: str = "htk") -> float: - """Convert Hertz to Mels. +def hertz_to_mel(freq: Union[float, np.ndarray], mel_scale: str = "htk") -> Union[float, np.ndarray]: + """ + Convert frequency from hertz to mels. Args: - freqs (`float`): - Frequencies in Hertz + freq (`float` or `np.ndarray`): + The frequency, or multiple frequencies, in hertz (Hz). mel_scale (`str`, *optional*, defaults to `"htk"`): - Scale to use, `htk` or `slaney`. + The mel frequency scale to use, `"htk"` or `"slaney"`. Returns: - mels (`float`): - Frequency in Mels + `float` or `np.ndarray`: The frequencies on the mel scale. """ if mel_scale not in ["slaney", "htk"]: raise ValueError('mel_scale should be one of "htk" or "slaney".') if mel_scale == "htk": - return 2595.0 * math.log10(1.0 + (freq / 700.0)) - - # Fill in the linear part - frequency_min = 0.0 - f_sp = 200.0 / 3 + return 2595.0 * np.log10(1.0 + (freq / 700.0)) - mels = (freq - frequency_min) / f_sp - - # Fill in the log-scale part min_log_hertz = 1000.0 - min_log_mel = (min_log_hertz - frequency_min) / f_sp - logstep = math.log(6.4) / 27.0 + min_log_mel = 15.0 + logstep = 27.0 / np.log(6.4) + mels = 3.0 * freq / 200.0 - if freq >= min_log_hertz: - mels = min_log_mel + math.log(freq / min_log_hertz) / logstep + if isinstance(freq, np.ndarray): + log_region = freq >= min_log_hertz + mels[log_region] = min_log_mel + np.log(freq[log_region] / min_log_hertz) * logstep + elif freq >= min_log_hertz: + mels = min_log_mel + np.log(freq / min_log_hertz) * logstep return mels -def mel_to_hertz(mels: np.array, mel_scale: str = "htk") -> np.array: - """Convert mel bin numbers to frequencies. +def mel_to_hertz(mels: Union[float, np.ndarray], mel_scale: str = "htk") -> Union[float, np.ndarray]: + """ + Convert frequency from mels to hertz. 
Args: - mels (`np.array`): - Mel frequencies + mels (`float` or `np.ndarray`): + The frequency, or multiple frequencies, in mels. mel_scale (`str`, *optional*, `"htk"`): - Scale to use: `htk` or `slaney`. + The mel frequency scale to use, `"htk"` or `"slaney"`. Returns: - freqs (`np.array`): - Mels converted to Hertz + `float` or `np.ndarray`: The frequencies in hertz. """ if mel_scale not in ["slaney", "htk"]: @@ -81,171 +76,509 @@ def mel_to_hertz(mels: np.array, mel_scale: str = "htk") -> np.array: if mel_scale == "htk": return 700.0 * (10.0 ** (mels / 2595.0) - 1.0) - # Fill in the linear scale - frequency_min = 0.0 - f_sp = 200.0 / 3 - freqs = frequency_min + f_sp * mels - - # And now the nonlinear scale min_log_hertz = 1000.0 - min_log_mel = (min_log_hertz - frequency_min) / f_sp - logstep = math.log(6.4) / 27.0 + min_log_mel = 15.0 + logstep = np.log(6.4) / 27.0 + freq = 200.0 * mels / 3.0 - log_t = mels >= min_log_mel - freqs[log_t] = min_log_hertz * np.exp(logstep * (mels[log_t] - min_log_mel)) + if isinstance(mels, np.ndarray): + log_region = mels >= min_log_mel + freq[log_region] = min_log_hertz * np.exp(logstep * (mels[log_region] - min_log_mel)) + elif mels >= min_log_mel: + freq = min_log_hertz * np.exp(logstep * (mels - min_log_mel)) - return freqs + return freq -def _create_triangular_filterbank( - all_freqs: np.array, - f_pts: np.array, -) -> np.array: - """Create a triangular filter bank. +def _create_triangular_filter_bank(fft_freqs: np.ndarray, filter_freqs: np.ndarray) -> np.ndarray: + """ + Creates a triangular filter bank. + Adapted from *torchaudio* and *librosa*. Args: - all_freqs (`np.array` of shape (`nb_frequency_bins`, )): - Discrete frequencies used when the STFT was computed. - f_pts (`np.array`, of shape (`nb_mel_filters`, )): - Coordinates of the middle points of the triangular filters to create. + fft_freqs (`np.ndarray` of shape `(num_frequency_bins,)`): + Discrete frequencies of the FFT bins in Hz. + filter_freqs (`np.ndarray` of shape `(num_mel_filters,)`): + Center frequencies of the triangular filters to create, in Hz. Returns: - fb (np.array): - The filter bank of size (`nb_frequency_bins`, `nb_mel_filters`). 
+ `np.ndarray` of shape `(num_frequency_bins, num_mel_filters)` """ - # Adapted from Librosa - # calculate the difference between each filter mid point and each stft freq point in hertz - f_diff = f_pts[1:] - f_pts[:-1] # (n_filter + 1) - slopes = np.expand_dims(f_pts, 0) - np.expand_dims(all_freqs, 1) # (nb_frequency_bins, n_filter + 2) - # create overlapping triangles - zero = np.zeros(1) - down_slopes = (-1.0 * slopes[:, :-2]) / f_diff[:-1] # (nb_frequency_bins, n_filter) - up_slopes = slopes[:, 2:] / f_diff[1:] # (nb_frequency_bins, n_filter) - fb = np.maximum(zero, np.minimum(down_slopes, up_slopes)) - - return fb - - -def get_mel_filter_banks( - nb_frequency_bins: int, - nb_mel_filters: int, - frequency_min: float, - frequency_max: float, - sample_rate: int, + filter_diff = np.diff(filter_freqs) + slopes = np.expand_dims(filter_freqs, 0) - np.expand_dims(fft_freqs, 1) + down_slopes = -slopes[:, :-2] / filter_diff[:-1] + up_slopes = slopes[:, 2:] / filter_diff[1:] + return np.maximum(np.zeros(1), np.minimum(down_slopes, up_slopes)) + + +def mel_filter_bank( + num_frequency_bins: int, + num_mel_filters: int, + min_frequency: float, + max_frequency: float, + sampling_rate: int, norm: Optional[str] = None, mel_scale: str = "htk", -) -> np.array: +) -> np.ndarray: """ - Create a frequency bin conversion matrix used to obtain the Mel Spectrogram. This is called a *mel filter bank*, - and various implementation exist, which differ in the number of filters, the shape of the filters, the way the - filters are spaced, the bandwidth of the filters, and the manner in which the spectrum is warped. The goal of these + Creates a frequency bin conversion matrix used to obtain a mel spectrogram. This is called a *mel filter bank*, and + various implementation exist, which differ in the number of filters, the shape of the filters, the way the filters + are spaced, the bandwidth of the filters, and the manner in which the spectrum is warped. The goal of these features is to approximate the non-linear human perception of the variation in pitch with respect to the frequency. - This code is heavily inspired from the *torchaudio* implementation, see - [here](https://pytorch.org/audio/stable/transforms.html) for more details. - - - Tips: - - Different banks of Mel filters were introduced in the litterature. The following variation are supported: - - MFCC FB-20: introduced in 1980 by Davis and Mermelstein, it assumes a sampling frequency of 10 kHertz - and a speech bandwidth of `[0, 4600]` Hertz - - MFCC FB-24 HTK: from the Cambridge HMM Toolkit (HTK) (1995) uses a filter bank of 24 filters for a - speech bandwidth `[0, 8000]` Hertz (sampling rate ≥ 16 kHertz). - - MFCC FB-40: from the Auditory Toolbox for MATLAB written by Slaney in 1998, assumes a sampling rate - of 16 kHertz, and speech bandwidth [133, 6854] Hertz. This version also includes an area normalization. - - HFCC-E FB-29 (Human Factor Cepstral Coefficients) of Skowronski and Harris (2004), assumes sampling - rate of 12.5 kHertz and speech bandwidth [0, 6250] Hertz - - The default parameters of `torchaudio`'s mel filterbanks implement the `"htk"` filers while `torchlibrosa` - uses the `"slaney"` implementation. + + Different banks of mel filters were introduced in the literature. The following variations are supported: + + - MFCC FB-20: introduced in 1980 by Davis and Mermelstein, it assumes a sampling frequency of 10 kHz and a speech + bandwidth of `[0, 4600]` Hz. 
+ - MFCC FB-24 HTK: from the Cambridge HMM Toolkit (HTK) (1995) uses a filter bank of 24 filters for a speech + bandwidth of `[0, 8000]` Hz. This assumes sampling rate ≥ 16 kHz. + - MFCC FB-40: from the Auditory Toolbox for MATLAB written by Slaney in 1998, assumes a sampling rate of 16 kHz and + speech bandwidth of `[133, 6854]` Hz. This version also includes area normalization. + - HFCC-E FB-29 (Human Factor Cepstral Coefficients) of Skowronski and Harris (2004), assumes a sampling rate of + 12.5 kHz and speech bandwidth of `[0, 6250]` Hz. + + This code is adapted from *torchaudio* and *librosa*. Note that the default parameters of torchaudio's + `melscale_fbanks` implement the `"htk"` filters while librosa uses the `"slaney"` implementation. Args: - nb_frequency_bins (`int`): + num_frequency_bins (`int`): Number of frequencies used to compute the spectrogram (should be the same as in `stft`). - nb_mel_filters (`int`): - Number of Mel filers to generate. - frequency_min (`float`): - Minimum frequency of interest(Hertz). - frequency_max (`float`): - Maximum frequency of interest(Hertz). - sample_rate (`int`): + num_mel_filters (`int`): + Number of mel filters to generate. + min_frequency (`float`): + Lowest frequency of interest in Hz. + max_frequency (`float`): + Highest frequency of interest in Hz. This should not exceed `sampling_rate / 2`. + sampling_rate (`int`): Sample rate of the audio waveform. norm (`str`, *optional*): - If "slaney", divide the triangular Mel weights by the width of the mel band (area normalization). + If `"slaney"`, divide the triangular mel weights by the width of the mel band (area normalization). mel_scale (`str`, *optional*, defaults to `"htk"`): - Scale to use: `"htk"` or `"slaney"`. + The mel frequency scale to use, `"htk"` or `"slaney"`. Returns: - `np.ndarray`: Triangular filter banks (fb matrix) of shape (`nb_frequency_bins`, `nb_mel_filters`). This matrix - is a projection matrix to go from a spectrogram to a Mel Spectrogram. - + `np.ndarray` of shape (`num_frequency_bins`, `num_mel_filters`): Triangular filter bank matrix. This is a + projection matrix to go from a spectrogram to a mel spectrogram. """ - if norm is not None and norm != "slaney": raise ValueError('norm must be one of None or "slaney"') - # freqency bins - all_freqs = np.linspace(0, sample_rate // 2, nb_frequency_bins) - - # Compute mim and max frequencies in mel scale - m_min = hertz_to_mel(frequency_min, mel_scale=mel_scale) - m_max = hertz_to_mel(frequency_max, mel_scale=mel_scale) + # frequencies of FFT bins in Hz + fft_freqs = np.linspace(0, sampling_rate // 2, num_frequency_bins) - # create the centers of the triangular mel filters. 
- m_pts = np.linspace(m_min, m_max, nb_mel_filters + 2) - f_pts = mel_to_hertz(m_pts, mel_scale=mel_scale) + # center points of the triangular mel filters + mel_min = hertz_to_mel(min_frequency, mel_scale=mel_scale) + mel_max = hertz_to_mel(max_frequency, mel_scale=mel_scale) + mel_freqs = np.linspace(mel_min, mel_max, num_mel_filters + 2) + filter_freqs = mel_to_hertz(mel_freqs, mel_scale=mel_scale) - # create the filterbank - filterbank = _create_triangular_filterbank(all_freqs, f_pts) + mel_filters = _create_triangular_filter_bank(fft_freqs, filter_freqs) if norm is not None and norm == "slaney": # Slaney-style mel is scaled to be approx constant energy per channel - enorm = 2.0 / (f_pts[2 : nb_mel_filters + 2] - f_pts[:nb_mel_filters]) - filterbank *= np.expand_dims(enorm, 0) + enorm = 2.0 / (filter_freqs[2 : num_mel_filters + 2] - filter_freqs[:num_mel_filters]) + mel_filters *= np.expand_dims(enorm, 0) - if (filterbank.max(axis=0) == 0.0).any(): + if (mel_filters.max(axis=0) == 0.0).any(): warnings.warn( - "At least one mel filterbank has all zero values. " - f"The value for `nb_mel_filters` ({nb_mel_filters}) may be set too high. " - f"Or, the value for `nb_frequency_bins` ({nb_frequency_bins}) may be set too low." + "At least one mel filter has all zero values. " + f"The value for `num_mel_filters` ({num_mel_filters}) may be set too high. " + f"Or, the value for `num_frequency_bins` ({num_frequency_bins}) may be set too low." + ) + + return mel_filters + + +def optimal_fft_length(window_length: int) -> int: + """ + Finds the best FFT input size for a given `window_length`. This function takes a given window length and, if not + already a power of two, rounds it up to the next power or two. + + The FFT algorithm works fastest when the length of the input is a power of two, which may be larger than the size + of the window or analysis frame. For example, if the window is 400 samples, using an FFT input size of 512 samples + is more optimal than an FFT size of 400 samples. Using a larger FFT size does not affect the detected frequencies, + it simply gives a higher frequency resolution (i.e. the frequency bins are smaller). + """ + return 2 ** int(np.ceil(np.log2(window_length))) + + +def window_function( + window_length: int, + name: str = "hann", + periodic: bool = True, + frame_length: Optional[int] = None, + center: bool = True, +) -> np.ndarray: + """ + Returns an array containing the specified window. This window is intended to be used with `stft`. + + The following window types are supported: + + - `"boxcar"`: a rectangular window + - `"hamming"`: the Hamming window + - `"hann"`: the Hann window + + Args: + window_length (`int`): + The length of the window in samples. + name (`str`, *optional*, defaults to `"hann"`): + The name of the window function. + periodic (`bool`, *optional*, defaults to `True`): + Whether the window is periodic or symmetric. + frame_length (`int`, *optional*): + The length of the analysis frames in samples. Provide a value for `frame_length` if the window is smaller + than the frame length, so that it will be zero-padded. + center (`bool`, *optional*, defaults to `True`): + Whether to center the window inside the FFT buffer. Only used when `frame_length` is provided. + + Returns: + `np.ndarray` of shape `(window_length,)` or `(frame_length,)` containing the window. 
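A small usage sketch for `window_function`, using the same call as the new tests; the zero-count simply follows from the `center=True` default:

from transformers.audio_utils import window_function

# periodic 400-sample Hann window, centered inside a 512-sample analysis frame
window = window_function(400, "hann", frame_length=512)
print(window.shape)              # (512,)
print((window[:56] == 0).all())  # True: (512 - 400) // 2 = 56 leading zeros of padding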
+ """ + length = window_length + 1 if periodic else window_length + + if name == "boxcar": + window = np.ones(length) + elif name in ["hamming", "hamming_window"]: + window = np.hamming(length) + elif name in ["hann", "hann_window"]: + window = np.hanning(length) + else: + raise ValueError(f"Unknown window function '{name}'") + + if periodic: + window = window[:-1] + + if frame_length is None: + return window + + if window_length > frame_length: + raise ValueError( + f"Length of the window ({window_length}) may not be larger than frame_length ({frame_length})" ) - return filterbank + padded_window = np.zeros(frame_length) + offset = (frame_length - window_length) // 2 if center else 0 + padded_window[offset : offset + window_length] = window + return padded_window + + +# TODO This method does not support batching yet as we are mainly focused on inference. +def spectrogram( + waveform: np.ndarray, + window: np.ndarray, + frame_length: int, + hop_length: int, + fft_length: Optional[int] = None, + power: Optional[float] = 1.0, + center: bool = True, + pad_mode: str = "reflect", + onesided: bool = True, + preemphasis: Optional[float] = None, + mel_filters: Optional[np.ndarray] = None, + mel_floor: float = 1e-10, + log_mel: Optional[str] = None, + reference: float = 1.0, + min_value: float = 1e-10, + db_range: Optional[float] = None, + dtype: np.dtype = np.float32, +) -> np.ndarray: + """ + Calculates a spectrogram over one waveform using the Short-Time Fourier Transform. + + This function can create the following kinds of spectrograms: + + - amplitude spectrogram (`power = 1.0`) + - power spectrogram (`power = 2.0`) + - complex-valued spectrogram (`power = None`) + - log spectrogram (use `log_mel` argument) + - mel spectrogram (provide `mel_filters`) + - log-mel spectrogram (provide `mel_filters` and `log_mel`) + + How this works: + + 1. The input waveform is split into frames of size `frame_length` that are partially overlapping by `frame_length + - hop_length` samples. + 2. Each frame is multiplied by the window and placed into a buffer of size `fft_length`. + 3. The DFT is taken of each windowed frame. + 4. The results are stacked into a spectrogram. + + We make a distinction between the following "blocks" of sample data, each of which may have a different lengths: + + - The analysis frame. This is the size of the time slices that the input waveform is split into. + - The window. Each analysis frame is multiplied by the window to avoid spectral leakage. + - The FFT input buffer. The length of this determines how many frequency bins are in the spectrogram. + + In this implementation, the window is assumed to be zero-padded to have the same size as the analysis frame. A + padded window can be obtained from `window_function()`. The FFT input buffer may be larger than the analysis frame, + typically the next power of two. + + Note: This function is not optimized for speed yet. It should be mostly compatible with `librosa.stft` and + `torchaudio.functional.transforms.Spectrogram`, although it is more flexible due to the different ways spectrograms + can be constructed. + + Args: + waveform (`np.ndarray` of shape `(length,)`): + The input waveform. This must be a single real-valued, mono waveform. + window (`np.ndarray` of shape `(frame_length,)`): + The windowing function to apply, including zero-padding if necessary. The actual window length may be + shorter than `frame_length`, but we're assuming the array has already been zero-padded. 
+ frame_length (`int`): + The length of the analysis frames in samples. With librosa this is always equal to `fft_length` but we also + allow smaller sizes. + hop_length (`int`): + The stride between successive analysis frames in samples. + fft_length (`int`, *optional*): + The size of the FFT buffer in samples. This determines how many frequency bins the spectrogram will have. + For optimal speed, this should be a power of two. If `None`, uses `frame_length`. + power (`float`, *optional*, defaults to 1.0): + If 1.0, returns the amplitude spectrogram. If 2.0, returns the power spectrogram. If `None`, returns + complex numbers. + center (`bool`, *optional*, defaults to `True`): + Whether to pad the waveform so that frame `t` is centered around time `t * hop_length`. If `False`, frame + `t` will start at time `t * hop_length`. + pad_mode (`str`, *optional*, defaults to `"reflect"`): + Padding mode used when `center` is `True`. Possible values are: `"constant"` (pad with zeros), `"edge"` + (pad with edge values), `"reflect"` (pads with mirrored values). + onesided (`bool`, *optional*, defaults to `True`): + If True, only computes the positive frequencies and returns a spectrogram containing `fft_length // 2 + 1` + frequency bins. If False, also computes the negative frequencies and returns `fft_length` frequency bins. + preemphasis (`float`, *optional*) + Coefficient for a low-pass filter that applies pre-emphasis before the DFT. + mel_filters (`np.ndarray` of shape `(num_freq_bins, num_mel_filters)`, *optional*): + The mel filter bank. If supplied, applies a this filter bank to create a mel spectrogram. + mel_floor (`float`, *optional*, defaults to 1e-10): + Minimum value of mel frequency banks. + log_mel (`str`, *optional*): + How to convert the spectrogram to log scale. Possible options are: `None` (don't convert), `"log"` (take + the natural logarithm) `"log10"` (take the base-10 logarithm), `"dB"` (convert to decibels). Can only be + used when `power` is not `None`. + reference (`float`, *optional*, defaults to 1.0): + Sets the input spectrogram value that corresponds to 0 dB. For example, use `np.max(spectrogram)` to set + the loudest part to 0 dB. Must be greater than zero. + min_value (`float`, *optional*, defaults to `1e-10`): + The spectrogram will be clipped to this minimum value before conversion to decibels, to avoid taking + `log(0)`. For a power spectrogram, the default of `1e-10` corresponds to a minimum of -100 dB. For an + amplitude spectrogram, the value `1e-5` corresponds to -100 dB. Must be greater than zero. + db_range (`float`, *optional*): + Sets the maximum dynamic range in decibels. For example, if `db_range = 80`, the difference between the + peak value and the smallest value will never be more than 80 dB. Must be greater than zero. + dtype (`np.dtype`, *optional*, defaults to `np.float32`): + Data type of the spectrogram tensor. If `power` is None, this argument is ignored and the dtype will be + `np.complex64`. + + Returns: + `nd.array` containing a spectrogram of shape `(num_frequency_bins, length)` for a regular spectrogram or shape + `(num_mel_filters, length)` for a mel spectrogram. 
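A minimal end-to-end sketch of `spectrogram`, wired up the same way the Whisper feature extractor below uses it (power spectrum, "slaney" mel filters, `log_mel="log10"`); the frame, hop, and filter sizes here are illustrative and the random waveform is only a placeholder:

import numpy as np
from transformers.audio_utils import mel_filter_bank, spectrogram, window_function

waveform = np.random.randn(16000).astype(np.float32)  # stand-in for 1 s of 16 kHz audio

mel_filters = mel_filter_bank(
    num_frequency_bins=1 + 400 // 2,
    num_mel_filters=80,
    min_frequency=0.0,
    max_frequency=8000.0,
    sampling_rate=16000,
    norm="slaney",
    mel_scale="slaney",
)

log_mel = spectrogram(
    waveform,
    window_function(400, "hann"),
    frame_length=400,
    hop_length=160,
    power=2.0,
    mel_filters=mel_filters,
    log_mel="log10",
)
print(log_mel.shape)  # (80, 101) with these sizes: (num_mel_filters, num_frames)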
+ """ + window_length = len(window) + + if fft_length is None: + fft_length = frame_length + + if frame_length > fft_length: + raise ValueError(f"frame_length ({frame_length}) may not be larger than fft_length ({fft_length})") + if window_length != frame_length: + raise ValueError(f"Length of the window ({window_length}) must equal frame_length ({frame_length})") -def power_to_db(mel_spectrogram, top_db=None, a_min=1e-10, ref=1.0): + if hop_length <= 0: + raise ValueError("hop_length must be greater than zero") + + if waveform.ndim != 1: + raise ValueError(f"Input waveform must have only one dimension, shape is {waveform.shape}") + + if np.iscomplexobj(waveform): + raise ValueError("Complex-valued input waveforms are not currently supported") + + # center pad the waveform + if center: + padding = [(int(frame_length // 2), int(frame_length // 2))] + waveform = np.pad(waveform, padding, mode=pad_mode) + + # promote to float64, since np.fft uses float64 internally + waveform = waveform.astype(np.float64) + window = window.astype(np.float64) + + # split waveform into frames of frame_length size + num_frames = int(1 + np.floor((waveform.size - frame_length) / hop_length)) + + num_frequency_bins = (fft_length // 2) + 1 if onesided else fft_length + spectrogram = np.empty((num_frames, num_frequency_bins), dtype=np.complex64) + + # rfft is faster than fft + fft_func = np.fft.rfft if onesided else np.fft.fft + buffer = np.zeros(fft_length) + + timestep = 0 + for frame_idx in range(num_frames): + buffer[:frame_length] = waveform[timestep : timestep + frame_length] + + if preemphasis is not None: + buffer[1:frame_length] -= preemphasis * buffer[: frame_length - 1] + buffer[0] *= 1 - preemphasis + + buffer[:frame_length] *= window + + spectrogram[frame_idx] = fft_func(buffer) + timestep += hop_length + + # note: ** is much faster than np.power + if power is not None: + spectrogram = np.abs(spectrogram, dtype=np.float64) ** power + + spectrogram = spectrogram.T + + if mel_filters is not None: + spectrogram = np.maximum(mel_floor, np.dot(mel_filters.T, spectrogram)) + + if power is not None and log_mel is not None: + if log_mel == "log": + spectrogram = np.log(spectrogram) + elif log_mel == "log10": + spectrogram = np.log10(spectrogram) + elif log_mel == "dB": + if power == 1.0: + spectrogram = amplitude_to_db(spectrogram, reference, min_value, db_range) + elif power == 2.0: + spectrogram = power_to_db(spectrogram, reference, min_value, db_range) + else: + raise ValueError(f"Cannot use log_mel option '{log_mel}' with power {power}") + else: + raise ValueError(f"Unknown log_mel option: {log_mel}") + + spectrogram = np.asarray(spectrogram, dtype) + + return spectrogram + + +def power_to_db( + spectrogram: np.ndarray, + reference: float = 1.0, + min_value: float = 1e-10, + db_range: Optional[float] = None, +) -> np.ndarray: + """ + Converts a power spectrogram to the decibel scale. This computes `10 * log10(spectrogram / reference)`, using basic + logarithm properties for numerical stability. + + The motivation behind applying the log function on the (mel) spectrogram is that humans do not hear loudness on a + linear scale. Generally to double the perceived volume of a sound we need to put 8 times as much energy into it. + This means that large variations in energy may not sound all that different if the sound is loud to begin with. + This compression operation makes the (mel) spectrogram features match more closely what humans actually hear. + + Based on the implementation of `librosa.power_to_db`. 
+ + Args: + spectrogram (`np.ndarray`): + The input power (mel) spectrogram. Note that a power spectrogram has the amplitudes squared! + reference (`float`, *optional*, defaults to 1.0): + Sets the input spectrogram value that corresponds to 0 dB. For example, use `np.max(spectrogram)` to set + the loudest part to 0 dB. Must be greater than zero. + min_value (`float`, *optional*, defaults to `1e-10`): + The spectrogram will be clipped to this minimum value before conversion to decibels, to avoid taking + `log(0)`. The default of `1e-10` corresponds to a minimum of -100 dB. Must be greater than zero. + db_range (`float`, *optional*): + Sets the maximum dynamic range in decibels. For example, if `db_range = 80`, the difference between the + peak value and the smallest value will never be more than 80 dB. Must be greater than zero. + + Returns: + `np.ndarray`: the spectrogram in decibels """ - Convert a mel spectrogram from power to db scale, this function is the numpy implementation of librosa.power_to_lb. - It computes `10 * log10(mel_spectrogram / ref)`, using basic log properties for stability. + if reference <= 0.0: + raise ValueError("reference must be greater than zero") + if min_value <= 0.0: + raise ValueError("min_value must be greater than zero") - Tips: - - The motivation behind applying the log function on the mel spectrogram is that humans do not hear loudness on - a - linear scale. Generally to double the percieved volume of a sound we need to put 8 times as much energy into - it. - - This means that large variations in energy may not sound all that different if the sound is loud to begin - with. This compression operation makes the mel features match more closely what humans actually hear. + reference = max(min_value, reference) + + spectrogram = np.clip(spectrogram, a_min=min_value, a_max=None) + spectrogram = 10.0 * (np.log10(spectrogram) - np.log10(reference)) + + if db_range is not None: + if db_range <= 0.0: + raise ValueError("db_range must be greater than zero") + spectrogram = np.clip(spectrogram, a_min=spectrogram.max() - db_range, a_max=None) + + return spectrogram + + +def amplitude_to_db( + spectrogram: np.ndarray, + reference: float = 1.0, + min_value: float = 1e-5, + db_range: Optional[float] = None, +) -> np.ndarray: + """ + Converts an amplitude spectrogram to the decibel scale. This computes `20 * log10(spectrogram / reference)`, using + basic logarithm properties for numerical stability. + + The motivation behind applying the log function on the (mel) spectrogram is that humans do not hear loudness on a + linear scale. Generally to double the perceived volume of a sound we need to put 8 times as much energy into it. + This means that large variations in energy may not sound all that different if the sound is loud to begin with. + This compression operation makes the (mel) spectrogram features match more closely what humans actually hear. Args: - mel_spectrogram (`np.array`): - Input mel spectrogram. - top_db (`int`, *optional*): - The maximum decibel value. - a_min (`int`, *optional*, default to 1e-10): - Minimum value to use when cliping the mel spectrogram. - ref (`float`, *optional*, default to 1.0): - Maximum reference value used to scale the mel_spectrogram. + spectrogram (`np.ndarray`): + The input amplitude (mel) spectrogram. + reference (`float`, *optional*, defaults to 1.0): + Sets the input spectrogram value that corresponds to 0 dB. For example, use `np.max(spectrogram)` to set + the loudest part to 0 dB. Must be greater than zero. 
+ min_value (`float`, *optional*, defaults to `1e-5`): + The spectrogram will be clipped to this minimum value before conversion to decibels, to avoid taking + `log(0)`. The default of `1e-5` corresponds to a minimum of -100 dB. Must be greater than zero. + db_range (`float`, *optional*): + Sets the maximum dynamic range in decibels. For example, if `db_range = 80`, the difference between the + peak value and the smallest value will never be more than 80 dB. Must be greater than zero. + Returns: + `np.ndarray`: the spectrogram in decibels """ - log_spec = 10 * np.log10(np.clip(mel_spectrogram, a_min=a_min, a_max=None)) - log_spec -= 10.0 * np.log10(np.maximum(a_min, ref)) - if top_db is not None: - if top_db < 0: - raise ValueError("top_db must be non-negative") - log_spec = np.clip(log_spec, min=np.maximum(log_spec) - top_db, max=np.inf) - return log_spec + if reference <= 0.0: + raise ValueError("reference must be greater than zero") + if min_value <= 0.0: + raise ValueError("min_value must be greater than zero") + + reference = max(min_value, reference) + + spectrogram = np.clip(spectrogram, a_min=min_value, a_max=None) + spectrogram = 20.0 * (np.log10(spectrogram) - np.log10(reference)) + + if db_range is not None: + if db_range <= 0.0: + raise ValueError("db_range must be greater than zero") + spectrogram = np.clip(spectrogram, a_min=spectrogram.max() - db_range, a_max=None) + + return spectrogram + + +### deprecated functions below this line ### + + +def get_mel_filter_banks( + nb_frequency_bins: int, + nb_mel_filters: int, + frequency_min: float, + frequency_max: float, + sample_rate: int, + norm: Optional[str] = None, + mel_scale: str = "htk", +) -> np.array: + warnings.warn( + "The function `get_mel_filter_banks` is deprecated and will be removed in version 4.31.0 of Transformers", + FutureWarning, + ) + return mel_filter_bank( + num_frequency_bins=nb_frequency_bins, + num_mel_filters=nb_mel_filters, + min_frequency=frequency_min, + max_frequency=frequency_max, + sampling_rate=sample_rate, + norm=norm, + mel_scale=mel_scale, + ) -# TODO @ArthurZucker: This method does not support batching yet as we are mainly focus on inference. def fram_wave(waveform: np.array, hop_length: int = 160, fft_window_size: int = 400, center: bool = True): """ In order to compute the short time fourier transform, the waveform needs to be split in overlapping windowed @@ -270,6 +603,10 @@ def fram_wave(waveform: np.array, hop_length: int = 160, fft_window_size: int = framed_waveform (`np.array` of shape `(waveform.shape // hop_length , fft_window_size)`): The framed waveforms that can be fed to `np.fft`. """ + warnings.warn( + "The function `fram_wave` is deprecated and will be removed in version 4.31.0 of Transformers", + FutureWarning, + ) frames = [] for i in range(0, waveform.shape[0] + 1, hop_length): if center: @@ -298,9 +635,6 @@ def fram_wave(waveform: np.array, hop_length: int = 160, fft_window_size: int = return frames -# TODO @ArthurZucker: This method does not support batching yet as we are mainly focus on inference. - - def stft(frames: np.array, windowing_function: np.array, fft_window_size: int = None): """ Calculates the complex Short-Time Fourier Transform (STFT) of the given framed signal. 
Should give the same results @@ -337,6 +671,10 @@ def stft(frames: np.array, windowing_function: np.array, fft_window_size: int = spectrogram (`np.ndarray`): A spectrogram of shape `(num_frames, nb_frequency_bins)` obtained using the STFT algorithm """ + warnings.warn( + "The function `stft` is deprecated and will be removed in version 4.31.0 of Transformers", + FutureWarning, + ) frame_size = frames.shape[1] if fft_window_size is None: @@ -355,5 +693,5 @@ def stft(frames: np.array, windowing_function: np.array, fft_window_size: int = np.multiply(frame, windowing_function, out=fft_signal[:frame_size]) else: fft_signal[:frame_size] = frame - spectrogram[f] = fft(fft_signal, axis=0)[:nb_frequency_bins] + spectrogram[f] = np.fft.fft(fft_signal, axis=0)[:nb_frequency_bins] return spectrogram.T diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py index b73873e05652..6edd739fa16d 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -21,7 +21,7 @@ import numpy as np import torch -from ...audio_utils import fram_wave, get_mel_filter_banks, power_to_db, stft +from ...audio_utils import mel_filter_bank, spectrogram, window_function from ...feature_extraction_sequence_utils import SequenceFeatureExtractor from ...feature_extraction_utils import BatchFeature from ...utils import TensorType, logging @@ -116,21 +116,21 @@ def __init__( self.sampling_rate = sampling_rate self.frequency_min = frequency_min self.frequency_max = frequency_max - self.mel_filters = get_mel_filter_banks( - nb_frequency_bins=self.nb_frequency_bins, - nb_mel_filters=feature_size, - frequency_min=frequency_min, - frequency_max=frequency_max, - sample_rate=sampling_rate, + self.mel_filters = mel_filter_bank( + num_frequency_bins=self.nb_frequency_bins, + num_mel_filters=feature_size, + min_frequency=frequency_min, + max_frequency=frequency_max, + sampling_rate=sampling_rate, norm=None, mel_scale="htk", ) - self.mel_filters_slaney = get_mel_filter_banks( - nb_frequency_bins=self.nb_frequency_bins, - nb_mel_filters=feature_size, - frequency_min=frequency_min, - frequency_max=frequency_max, - sample_rate=sampling_rate, + self.mel_filters_slaney = mel_filter_bank( + num_frequency_bins=self.nb_frequency_bins, + num_mel_filters=feature_size, + min_frequency=frequency_min, + max_frequency=frequency_max, + sampling_rate=sampling_rate, norm="slaney", mel_scale="slaney", ) @@ -153,24 +153,25 @@ def to_dict(self) -> Dict[str, Any]: def _np_extract_fbank_features(self, waveform: np.array, mel_filters: Optional[np.array] = None) -> np.ndarray: """ - Compute the log-Mel spectrogram of the provided `waveform` using the `hanning` window. In CLAP, two different - filter banks are used depending on the truncation pattern: - - `self.mel_filters`: they correspond to the defaults parameters of `torchaduio` which can be obtained from + Compute the log-mel spectrogram of the provided `waveform` using the Hann window. In CLAP, two different filter + banks are used depending on the truncation pattern: + - `self.mel_filters`: they correspond to the default parameters of `torchaudio` which can be obtained from calling `torchaudio.transforms.MelSpectrogram().mel_scale.fb`. These filters are used when `truncation` is set to `"fusion"`. 
- - `self.mel_filteres_slaney` : they correspond to the defaults parameters of `torchlibrosa` which used + - `self.mel_filteres_slaney` : they correspond to the default parameters of `librosa` which used `librosa.filters.mel` when computing the mel spectrogram. These filters were only used in the original implementation when the truncation mode is not `"fusion"`. """ - window = np.hanning(self.fft_window_size + 1)[:-1] - frames = fram_wave(waveform, self.hop_length, self.fft_window_size) - spectrogram = stft(frames, window, fft_window_size=self.fft_window_size) - - magnitudes = np.abs(spectrogram) ** 2 - mel_spectrogram = np.matmul(mel_filters.T, magnitudes) - log_mel_spectrogram = power_to_db(mel_spectrogram).T - log_mel_spectrogram = np.asarray(log_mel_spectrogram, np.float32) - return log_mel_spectrogram + log_mel_spectrogram = spectrogram( + waveform, + window_function(self.fft_window_size, "hann"), + frame_length=self.fft_window_size, + hop_length=self.hop_length, + power=2.0, + mel_filters=mel_filters, + log_mel="dB", + ) + return log_mel_spectrogram.T def _random_mel_fusion(self, mel, total_frames, chunk_frames): ranges = np.array_split(list(range(0, total_frames - chunk_frames + 1)), 3) diff --git a/src/transformers/models/mctct/feature_extraction_mctct.py b/src/transformers/models/mctct/feature_extraction_mctct.py index d517e3caf85e..467e654244b9 100644 --- a/src/transformers/models/mctct/feature_extraction_mctct.py +++ b/src/transformers/models/mctct/feature_extraction_mctct.py @@ -20,9 +20,8 @@ import numpy as np import torch -import torchaudio -from packaging import version +from ...audio_utils import mel_filter_bank, optimal_fft_length, spectrogram from ...feature_extraction_sequence_utils import SequenceFeatureExtractor from ...feature_extraction_utils import BatchFeature from ...file_utils import PaddingStrategy, TensorType @@ -31,13 +30,6 @@ logger = logging.get_logger(__name__) -parsed_torchaudio_version_base = version.parse(version.parse(torchaudio.__version__).base_version) -if not parsed_torchaudio_version_base >= version.parse("0.10"): - logger.warning( - f"You are using torchaudio=={torchaudio.__version__}, but torchaudio>=0.10.0 is required to use " - "MCTCTFeatureExtractor. This requires torch>=1.10.0. Please upgrade torch and torchaudio." 
- ) - class MCTCTFeatureExtractor(SequenceFeatureExtractor): r""" @@ -110,68 +102,9 @@ def __init__( self.sample_size = win_length * sampling_rate // 1000 self.sample_stride = hop_length * sampling_rate // 1000 - self.n_fft = 2 ** int(np.ceil(np.log2(self.sample_size))) + self.n_fft = optimal_fft_length(self.sample_size) self.n_freqs = (self.n_fft // 2) + 1 - @staticmethod - def _num_frames_calc(in_size, frame_size, frame_stride): - return int(1 + np.floor((in_size - frame_size) * 1 / frame_stride)) - - @staticmethod - def _frame_signal(one_waveform, n_frames, frame_signal_scale, window_length, sample_stride): - scale = frame_signal_scale - frames = np.zeros(n_frames * window_length) - for frame_idx in range(n_frames): - start = frame_idx * window_length - end = (frame_idx + 1) * window_length - wave_start = frame_idx * sample_stride - wave_end = frame_idx * sample_stride + window_length - frames[start:end] = scale * one_waveform[wave_start:wave_end] - - return frames - - @staticmethod - def _apply_preemphasis_inplace(frames, window_length, preemphasis_coeff): - if frames.size % window_length != 0: - raise ValueError( - f"`frames` is supposed to have length divisble by `window_length`, but is {frames.size} with" - f" window_length={window_length}." - ) - - n_frames = frames.size // window_length - for frame_idx in range(n_frames, 0, -1): - start = (frame_idx - 1) * window_length - end = frame_idx * window_length - 1 - frames[start + 1 : end + 1] -= preemphasis_coeff * frames[start:end] - frames[start] *= 1 - preemphasis_coeff - - @staticmethod - def _windowing(frames, window_length, window): - if frames.size % window_length != 0: - raise ValueError( - f"`frames` is supposed to have length divisble by `window_length`, but is {frames.size} with" - f" window_length={window_length}." - ) - - shaped = frames.reshape(-1, window_length) - shaped = window * shaped - return shaped - - @staticmethod - def _dft(frames, K, n_frames, n_samples, n_fft): - dft = np.zeros([n_frames, K]) - - for frame in range(n_frames): - begin = frame * n_samples - - inwards_buffer = frames[begin : begin + n_samples] - inwards_buffer = np.pad(inwards_buffer, (0, n_fft - n_samples), "constant") - out = np.fft.rfft(inwards_buffer) - - dft[frame] = np.abs(out[:K]) - - return dft - def _extract_mfsc_features(self, one_waveform: np.array) -> np.ndarray: """ Extracts MFSC Features for one waveform vector (unbatched). Adapted from Flashlight's C++ MFSC code. 
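The removed `_apply_preemphasis_inplace` helper is subsumed by the `preemphasis` argument of `spectrogram`, used in the next hunk. A minimal sketch of the per-frame filter it applies; the helper name and the 0.97 coefficient are illustrative, not part of the library:

import numpy as np

def preemphasize_frame(frame: np.ndarray, coeff: float) -> np.ndarray:
    # y[0] = (1 - coeff) * x[0]; y[n] = x[n] - coeff * x[n - 1] for n >= 1,
    # which is what `spectrogram` applies to each analysis frame (before the
    # window is applied) when `preemphasis` is not None.
    out = frame.astype(np.float64)
    out[1:] -= coeff * frame[:-1]
    out[0] *= 1.0 - coeff
    return out

frame = np.random.randn(400)
emphasized = preemphasize_frame(frame, coeff=0.97)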
@@ -183,36 +116,27 @@ def _extract_mfsc_features(self, one_waveform: np.array) -> np.ndarray: window = window.numpy() - fbanks = torchaudio.functional.melscale_fbanks( - n_freqs=self.n_freqs, - f_min=0.0, # change this to zeros - f_max=self.sampling_rate / 2.0, - n_mels=self.feature_size, - sample_rate=self.sampling_rate, + fbanks = mel_filter_bank( + num_frequency_bins=self.n_freqs, + num_mel_filters=self.feature_size, + min_frequency=0.0, + max_frequency=self.sampling_rate / 2.0, + sampling_rate=self.sampling_rate, ) - fbanks = fbanks.numpy() - - n_frames = self._num_frames_calc(one_waveform.size, self.sample_size, self.sample_stride) - - frames = self._frame_signal( - one_waveform, n_frames, self.frame_signal_scale, self.sample_size, self.sample_stride + msfc_features = spectrogram( + one_waveform * self.frame_signal_scale, + window=window, + frame_length=self.sample_size, + hop_length=self.sample_stride, + fft_length=self.n_fft, + center=False, + preemphasis=self.preemphasis_coeff, + mel_filters=fbanks, + mel_floor=self.mel_floor, + log_mel="log", ) - - self._apply_preemphasis_inplace(frames, self.sample_size, self.preemphasis_coeff) - - frames = self._windowing(frames, self.sample_size, window) - - dft_out = self._dft(frames.flatten(), self.n_freqs, n_frames, self.sample_size, self.n_fft) - - # msfc_features = STFT * mel frequency banks. - msfc_features = np.einsum("...tf,fm->...tm", dft_out, fbanks) - - # clamp feature values then log scale, as implemented in flashlight - msfc_features = np.maximum(msfc_features, self.mel_floor) - msfc_features = np.log(msfc_features) - - return msfc_features + return msfc_features.T def _normalize_one(self, x, input_length, padding_value): # make sure we normalize float32 arrays diff --git a/src/transformers/models/speecht5/feature_extraction_speecht5.py b/src/transformers/models/speecht5/feature_extraction_speecht5.py index 8ceb48dc03c6..5fe6ca39765c 100644 --- a/src/transformers/models/speecht5/feature_extraction_speecht5.py +++ b/src/transformers/models/speecht5/feature_extraction_speecht5.py @@ -20,7 +20,7 @@ import numpy as np import torch -from ...audio_utils import get_mel_filter_banks +from ...audio_utils import mel_filter_bank, optimal_fft_length, spectrogram from ...feature_extraction_sequence_utils import SequenceFeatureExtractor from ...feature_extraction_utils import BatchFeature from ...utils import PaddingStrategy, TensorType, logging @@ -110,18 +110,18 @@ def __init__( self.sample_size = win_length * sampling_rate // 1000 self.sample_stride = hop_length * sampling_rate // 1000 - self.n_fft = 2 ** int(np.ceil(np.log2(self.sample_size))) + self.n_fft = optimal_fft_length(self.sample_size) self.n_freqs = (self.n_fft // 2) + 1 window = getattr(torch, self.win_function)(window_length=self.sample_size, periodic=True) self.window = window.numpy().astype(np.float64) - self.mel_filters = get_mel_filter_banks( - nb_frequency_bins=self.n_freqs, - nb_mel_filters=self.num_mel_bins, - frequency_min=self.fmin, - frequency_max=self.fmax, - sample_rate=self.sampling_rate, + self.mel_filters = mel_filter_bank( + num_frequency_bins=self.n_freqs, + num_mel_filters=self.num_mel_bins, + min_frequency=self.fmin, + max_frequency=self.fmax, + sampling_rate=self.sampling_rate, norm="slaney", mel_scale="slaney", ) @@ -160,31 +160,6 @@ def zero_mean_unit_var_norm( return normed_input_values - @staticmethod - def _stft(waveform: np.ndarray, fft_length: int, hop_length: int, window: np.ndarray) -> np.ndarray: - """ - Calculates the magnitude spectrogram over one 
waveform array. - """ - # center pad the waveform - padding = [(int(fft_length // 2), int(fft_length // 2))] - waveform = np.pad(waveform, padding, mode="reflect") - waveform_size = waveform.size - - # promote to float64, since np.fft uses float64 internally - waveform = waveform.astype(np.float64) - - num_frames = int(1 + np.floor((waveform_size - fft_length) / hop_length)) - num_frequency_bins = (fft_length // 2) + 1 - spectrogram = np.empty((num_frames, num_frequency_bins)) - - start = 0 - for frame_idx in range(num_frames): - frame = waveform[start : start + fft_length] * window - spectrogram[frame_idx] = np.abs(np.fft.rfft(frame)) - start += hop_length - - return spectrogram - def _extract_mel_features( self, one_waveform: np.ndarray, @@ -192,14 +167,17 @@ def _extract_mel_features( """ Extracts log-mel filterbank features for one waveform array (unbatched). """ - if self.n_fft != self.sample_size: - raise NotImplementedError( - f"Currently the STFT frame size must be a power of two, but got {self.sample_size} for a window length of {self.win_length} and sampling rate of {self.sampling_rate}. Ensure `win_length * sampling_rate // 1000` is divisible by two." - ) - - stft_out = self._stft(one_waveform, self.n_fft, self.sample_stride, self.window) - - return np.log10(np.maximum(self.mel_floor, np.dot(stft_out, self.mel_filters))) + log_mel_spec = spectrogram( + one_waveform, + window=self.window, + frame_length=self.sample_size, + hop_length=self.sample_stride, + fft_length=self.n_fft, + mel_filters=self.mel_filters, + mel_floor=self.mel_floor, + log_mel="log10", + ) + return log_mel_spec.T def __call__( self, diff --git a/src/transformers/models/tvlt/feature_extraction_tvlt.py b/src/transformers/models/tvlt/feature_extraction_tvlt.py index ac219502f1bd..6d919550cf55 100644 --- a/src/transformers/models/tvlt/feature_extraction_tvlt.py +++ b/src/transformers/models/tvlt/feature_extraction_tvlt.py @@ -18,8 +18,8 @@ from typing import List, Optional, Union import numpy as np -from numpy.fft import fft +from ...audio_utils import mel_filter_bank, spectrogram, window_function from ...feature_extraction_sequence_utils import BatchFeature, SequenceFeatureExtractor from ...utils import TensorType, logging @@ -83,143 +83,34 @@ def __init__( self.hop_length = sampling_rate // hop_length_to_sampling_rate self.sampling_rate = sampling_rate self.padding_value = padding_value - self.mel_filters = self.get_mel_filters(sampling_rate, n_fft, n_mels=feature_size) - - # Copied from transformers.models.whisper.feature_extraction_whisper.WhisperFeatureExtractor.get_mel_filters with 45.245640471924965->59.99247463746737 - def get_mel_filters(self, sr, n_fft, n_mels=128, dtype=np.float32): - # Initialize the weights - n_mels = int(n_mels) - weights = np.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype) - - # Center freqs of each FFT bin - fftfreqs = np.fft.rfftfreq(n=n_fft, d=1.0 / sr) - - # 'Center freqs' of mel bands - uniformly spaced between limits - min_mel = 0.0 - max_mel = 59.99247463746737 - - mels = np.linspace(min_mel, max_mel, n_mels + 2) - - mels = np.asanyarray(mels) - - # Fill in the linear scale - f_min = 0.0 - f_sp = 200.0 / 3 - freqs = f_min + f_sp * mels - - # And now the nonlinear scale - min_log_hz = 1000.0 # beginning of log region (Hz) - min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels) - logstep = np.log(6.4) / 27.0 # step size for log region - - # If we have vector data, vectorize - log_t = mels >= min_log_mel - freqs[log_t] = min_log_hz * np.exp(logstep * (mels[log_t] - 
min_log_mel)) - - mel_f = freqs - - fdiff = np.diff(mel_f) - ramps = np.subtract.outer(mel_f, fftfreqs) - - for i in range(n_mels): - # lower and upper slopes for all bins - lower = -ramps[i] / fdiff[i] - upper = ramps[i + 2] / fdiff[i + 1] - - # .. then intersect them with each other and zero - weights[i] = np.maximum(0, np.minimum(lower, upper)) - - # Slaney-style mel is scaled to be approx constant energy per channel - enorm = 2.0 / (mel_f[2 : n_mels + 2] - mel_f[:n_mels]) - weights *= enorm[:, np.newaxis] - - return weights - - # Copied from transformers.models.whisper.feature_extraction_whisper.WhisperFeatureExtractor.fram_wave - def fram_wave(self, waveform, center=True): - """ - Transform a raw waveform into a list of smaller waveforms. The window length defines how much of the signal is - contain in each frame (smalle waveform), while the hope length defines the step between the beginning of each - new frame. - - Centering is done by reflecting the waveform which is first centered around `frame_idx * hop_length`. - """ - frames = [] - for i in range(0, waveform.shape[0] + 1, self.hop_length): - half_window = (self.n_fft - 1) // 2 + 1 - if center: - start = i - half_window if i > half_window else 0 - end = i + half_window if i < waveform.shape[0] - half_window else waveform.shape[0] - - frame = waveform[start:end] - - if start == 0: - padd_width = (-i + half_window, 0) - frame = np.pad(frame, pad_width=padd_width, mode="reflect") - - elif end == waveform.shape[0]: - padd_width = (0, (i - waveform.shape[0] + half_window)) - frame = np.pad(frame, pad_width=padd_width, mode="reflect") - - else: - frame = waveform[i : i + self.n_fft] - frame_width = frame.shape[0] - if frame_width < waveform.shape[0]: - frame = np.lib.pad( - frame, pad_width=(0, self.n_fft - frame_width), mode="constant", constant_values=0 - ) - - frames.append(frame) - return np.stack(frames, 0) - - # Copied from transformers.models.whisper.feature_extraction_whisper.WhisperFeatureExtractor.stft - def stft(self, frames, window): - """ - Calculates the complex Short-Time Fourier Transform (STFT) of the given framed signal. Should give the same - results as `torch.stft`. - """ - frame_size = frames.shape[1] - fft_size = self.n_fft - - if fft_size is None: - fft_size = frame_size - - if fft_size < frame_size: - raise ValueError("FFT size must greater or equal the frame size") - # number of FFT bins to store - num_fft_bins = (fft_size >> 1) + 1 - - data = np.empty((len(frames), num_fft_bins), dtype=np.complex64) - fft_signal = np.zeros(fft_size) - - for f, frame in enumerate(frames): - if window is not None: - np.multiply(frame, window, out=fft_signal[:frame_size]) - else: - fft_signal[:frame_size] = frame - data[f] = fft(fft_signal, axis=0)[:num_fft_bins] - return data.T + self.mel_filters = mel_filter_bank( + num_frequency_bins=1 + n_fft // 2, + num_mel_filters=feature_size, + min_frequency=0.0, + max_frequency=22050.0, + sampling_rate=sampling_rate, + norm="slaney", + mel_scale="slaney", + ).T def _np_extract_fbank_features(self, waveform: np.array) -> np.ndarray: """ - Compute the log-Mel spectrogram of the provided audio, gives similar results whisper's original torch + Compute the log-mel spectrogram of the provided audio, gives similar results to Whisper's original torch implementation with 1e-5 tolerance. 
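The `db_range=80.0` argument in the new call below reproduces the removed `np.maximum(log_spec, log_spec.max() - 80.0)` clamp. A standalone sketch of that equivalence on a toy mel power spectrogram:

import numpy as np
from transformers.audio_utils import power_to_db

mel_spec = np.random.rand(64, 100)  # toy mel power spectrogram

# removed TVLT-style post-processing
old_style = 10.0 * np.log10(np.maximum(1e-10, mel_spec))
old_style = np.maximum(old_style, old_style.max() - 80.0)

# what `log_mel="dB"` with `db_range=80.0` computes internally
new_style = power_to_db(mel_spec, reference=1.0, min_value=1e-10, db_range=80.0)

print(np.allclose(old_style, new_style))  # True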
""" - window = np.hanning(self.n_fft + 1)[:-1] - - frames = self.fram_wave(waveform) - stft = self.stft(frames, window=window) - magnitudes = np.abs(stft[:, :-1]) ** 2 - - filters = self.mel_filters - mel_spec = filters @ magnitudes - - log_spec = 10.0 * np.log10(np.maximum(1e-10, mel_spec)) - log_spec -= 10.0 * np.log10(np.maximum(1e-10, 1.0)) - log_spec = np.maximum(log_spec, log_spec.max() - 80.0) + log_spec = spectrogram( + waveform, + window_function(self.n_fft, "hann"), + frame_length=self.n_fft, + hop_length=self.hop_length, + power=2.0, + mel_filters=self.mel_filters.T, + log_mel="dB", + db_range=80.0, + ) + log_spec = log_spec[:, :-1] log_spec = log_spec - 20.0 log_spec = np.clip(log_spec / 40.0, -2.0, 0.0) + 1.0 - return log_spec def __call__( diff --git a/src/transformers/models/whisper/feature_extraction_whisper.py b/src/transformers/models/whisper/feature_extraction_whisper.py index da700f2d2257..e0b772216205 100644 --- a/src/transformers/models/whisper/feature_extraction_whisper.py +++ b/src/transformers/models/whisper/feature_extraction_whisper.py @@ -19,8 +19,8 @@ from typing import Any, Dict, List, Optional, Union import numpy as np -from numpy.fft import fft +from ...audio_utils import mel_filter_bank, spectrogram, window_function from ...feature_extraction_sequence_utils import SequenceFeatureExtractor from ...feature_extraction_utils import BatchFeature from ...utils import TensorType, logging @@ -81,138 +81,33 @@ def __init__( self.n_samples = chunk_length * sampling_rate self.nb_max_frames = self.n_samples // hop_length self.sampling_rate = sampling_rate - self.mel_filters = self.get_mel_filters(sampling_rate, n_fft, n_mels=feature_size) - - def get_mel_filters(self, sr, n_fft, n_mels=128, dtype=np.float32): - # Initialize the weights - n_mels = int(n_mels) - weights = np.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype) - - # Center freqs of each FFT bin - fftfreqs = np.fft.rfftfreq(n=n_fft, d=1.0 / sr) - - # 'Center freqs' of mel bands - uniformly spaced between limits - min_mel = 0.0 - max_mel = 45.245640471924965 - - mels = np.linspace(min_mel, max_mel, n_mels + 2) - - mels = np.asanyarray(mels) - - # Fill in the linear scale - f_min = 0.0 - f_sp = 200.0 / 3 - freqs = f_min + f_sp * mels - - # And now the nonlinear scale - min_log_hz = 1000.0 # beginning of log region (Hz) - min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels) - logstep = np.log(6.4) / 27.0 # step size for log region - - # If we have vector data, vectorize - log_t = mels >= min_log_mel - freqs[log_t] = min_log_hz * np.exp(logstep * (mels[log_t] - min_log_mel)) - - mel_f = freqs - - fdiff = np.diff(mel_f) - ramps = np.subtract.outer(mel_f, fftfreqs) - - for i in range(n_mels): - # lower and upper slopes for all bins - lower = -ramps[i] / fdiff[i] - upper = ramps[i + 2] / fdiff[i + 1] - - # .. then intersect them with each other and zero - weights[i] = np.maximum(0, np.minimum(lower, upper)) - - # Slaney-style mel is scaled to be approx constant energy per channel - enorm = 2.0 / (mel_f[2 : n_mels + 2] - mel_f[:n_mels]) - weights *= enorm[:, np.newaxis] - - return weights - - def fram_wave(self, waveform, center=True): - """ - Transform a raw waveform into a list of smaller waveforms. The window length defines how much of the signal is - contain in each frame (smalle waveform), while the hope length defines the step between the beginning of each - new frame. - - Centering is done by reflecting the waveform which is first centered around `frame_idx * hop_length`. 
- """ - frames = [] - for i in range(0, waveform.shape[0] + 1, self.hop_length): - half_window = (self.n_fft - 1) // 2 + 1 - if center: - start = i - half_window if i > half_window else 0 - end = i + half_window if i < waveform.shape[0] - half_window else waveform.shape[0] - - frame = waveform[start:end] - - if start == 0: - padd_width = (-i + half_window, 0) - frame = np.pad(frame, pad_width=padd_width, mode="reflect") - - elif end == waveform.shape[0]: - padd_width = (0, (i - waveform.shape[0] + half_window)) - frame = np.pad(frame, pad_width=padd_width, mode="reflect") - - else: - frame = waveform[i : i + self.n_fft] - frame_width = frame.shape[0] - if frame_width < waveform.shape[0]: - frame = np.lib.pad( - frame, pad_width=(0, self.n_fft - frame_width), mode="constant", constant_values=0 - ) - - frames.append(frame) - return np.stack(frames, 0) - - def stft(self, frames, window): - """ - Calculates the complex Short-Time Fourier Transform (STFT) of the given framed signal. Should give the same - results as `torch.stft`. - """ - frame_size = frames.shape[1] - fft_size = self.n_fft - - if fft_size is None: - fft_size = frame_size - - if fft_size < frame_size: - raise ValueError("FFT size must greater or equal the frame size") - # number of FFT bins to store - num_fft_bins = (fft_size >> 1) + 1 - - data = np.empty((len(frames), num_fft_bins), dtype=np.complex64) - fft_signal = np.zeros(fft_size) - - for f, frame in enumerate(frames): - if window is not None: - np.multiply(frame, window, out=fft_signal[:frame_size]) - else: - fft_signal[:frame_size] = frame - data[f] = fft(fft_signal, axis=0)[:num_fft_bins] - return data.T + self.mel_filters = mel_filter_bank( + num_frequency_bins=1 + n_fft // 2, + num_mel_filters=feature_size, + min_frequency=0.0, + max_frequency=8000.0, + sampling_rate=sampling_rate, + norm="slaney", + mel_scale="slaney", + ) def _np_extract_fbank_features(self, waveform: np.array) -> np.ndarray: """ - Compute the log-Mel spectrogram of the provided audio, gives similar results whisper's original torch + Compute the log-mel spectrogram of the provided audio, gives similar results to Whisper's original torch implementation with 1e-5 tolerance. 
""" - window = np.hanning(self.n_fft + 1)[:-1] - - frames = self.fram_wave(waveform) - stft = self.stft(frames, window=window) - magnitudes = np.abs(stft[:, :-1]) ** 2 - - filters = self.mel_filters - mel_spec = filters @ magnitudes - - log_spec = np.log10(np.clip(mel_spec, a_min=1e-10, a_max=None)) + log_spec = spectrogram( + waveform, + window_function(self.n_fft, "hann"), + frame_length=self.n_fft, + hop_length=self.hop_length, + power=2.0, + mel_filters=self.mel_filters, + log_mel="log10", + ) + log_spec = log_spec[:, :-1] log_spec = np.maximum(log_spec, log_spec.max() - 8.0) log_spec = (log_spec + 4.0) / 4.0 - return log_spec @staticmethod diff --git a/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py b/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py index 6fd035af8d04..a7a81dceb153 100644 --- a/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py +++ b/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py @@ -160,6 +160,7 @@ def test_integration(self): # fmt: on input_speech = self._load_datasamples(1) - feaure_extractor = ASTFeatureExtractor() - input_values = feaure_extractor(input_speech, return_tensors="pt").input_values + feature_extractor = ASTFeatureExtractor() + input_values = feature_extractor(input_speech, return_tensors="pt").input_values + self.assertEquals(input_values.shape, (1, 1024, 128)) self.assertTrue(torch.allclose(input_values[0, 0, :30], EXPECTED_INPUT_VALUES, atol=1e-4)) diff --git a/tests/models/mctct/test_feature_extraction_mctct.py b/tests/models/mctct/test_feature_extraction_mctct.py index 29b0cf899ad3..a3c07474d281 100644 --- a/tests/models/mctct/test_feature_extraction_mctct.py +++ b/tests/models/mctct/test_feature_extraction_mctct.py @@ -21,7 +21,7 @@ import numpy as np from transformers import is_speech_available -from transformers.testing_utils import require_torch, require_torchaudio +from transformers.testing_utils import require_torch from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin @@ -47,7 +47,6 @@ def floats_list(shape, scale=1.0, rng=None, name=None): @require_torch -@require_torchaudio class MCTCTFeatureExtractionTester(unittest.TestCase): def __init__( self, @@ -102,7 +101,6 @@ def _flatten(list_of_lists): @require_torch -@require_torchaudio class MCTCTFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase): feature_extraction_class = MCTCTFeatureExtractor if is_speech_available() else None @@ -271,3 +269,38 @@ def test_different_window(self): self.assertTrue(np_processed.input_features.dtype == np.float32) pt_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="pt") self.assertTrue(pt_processed.input_features.dtype == torch.float32) + + def _load_datasamples(self, num_samples): + from datasets import load_dataset + + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + # automatic decoding with librispeech + speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] + + return [x["array"] for x in speech_samples] + + def test_integration(self): + # fmt: off + expected = np.array([ + [ + 1.1280, 1.1319, 1.2744, 1.4369, 1.4328, 1.3671, 1.2889, 1.3046, + 1.4419, 0.8387, 0.2995, 0.0404, 0.1068, 0.0472, 0.3728, 1.3356, + 1.4491, 0.4770, 0.3997, 0.2776, 0.3184, -0.1243, -0.1170, -0.0828 + ], + [ + 1.0826, 
1.0565, 1.2110, 1.3886, 1.3416, 1.2009, 1.1894, 1.2707, + 1.5153, 0.7005, 0.4916, 0.4017, 0.3743, 0.1935, 0.4228, 1.1084, + 0.9768, 0.0608, 0.2044, 0.1723, 0.0433, -0.2360, -0.2478, -0.2643 + ], + [ + 1.0590, 0.9923, 1.1185, 1.3309, 1.1971, 1.0067, 1.0080, 1.2036, + 1.5397, 1.0383, 0.7672, 0.7551, 0.4878, 0.8771, 0.7565, 0.8775, + 0.9042, 0.4595, 0.6157, 0.4954, 0.1857, 0.0307, 0.0199, 0.1033 + ], + ]) + # fmt: on + + input_speech = self._load_datasamples(1) + feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) + input_features = feature_extractor(input_speech, sampling_rate=16000, return_tensors="pt").input_features + self.assertTrue(np.allclose(input_features[0, 100:103], expected, atol=1e-4)) diff --git a/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py b/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py index 749323d3a8c9..aedd445e5d63 100644 --- a/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py +++ b/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py @@ -247,3 +247,27 @@ def test_double_precision_pad(self): self.assertTrue(np_processed.input_features.dtype == np.float32) pt_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="pt") self.assertTrue(pt_processed.input_features.dtype == torch.float32) + + def _load_datasamples(self, num_samples): + from datasets import load_dataset + + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + # automatic decoding with librispeech + speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] + + return [x["array"] for x in speech_samples] + + def test_integration(self): + # fmt: off + expected = np.array([ + -1.5745, -1.7713, -1.7020, -1.6069, -1.2250, -1.1105, -0.9072, -0.8241, + -1.2310, -0.8098, -0.3320, -0.4101, -0.7985, -0.4996, -0.8213, -0.9128, + -1.0420, -1.1286, -1.0440, -0.7999, -0.8405, -1.2275, -1.5443, -1.4625, + ]) + # fmt: on + + input_speech = self._load_datasamples(1) + feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) + input_features = feature_extractor(input_speech, return_tensors="pt").input_features + self.assertEquals(input_features.shape, (1, 584, 24)) + self.assertTrue(np.allclose(input_features[0, 0, :30], expected, atol=1e-4)) diff --git a/tests/models/speecht5/test_feature_extraction_speecht5.py b/tests/models/speecht5/test_feature_extraction_speecht5.py index d19c71dd56f9..da733614913f 100644 --- a/tests/models/speecht5/test_feature_extraction_speecht5.py +++ b/tests/models/speecht5/test_feature_extraction_speecht5.py @@ -395,7 +395,8 @@ def test_integration(self): input_speech = self._load_datasamples(1) feature_extractor = SpeechT5FeatureExtractor() input_values = feature_extractor(input_speech, return_tensors="pt").input_values - self.assertTrue(torch.allclose(input_values[0, :30], EXPECTED_INPUT_VALUES, atol=1e-4)) + self.assertEquals(input_values.shape, (1, 93680)) + self.assertTrue(torch.allclose(input_values[0, :30], EXPECTED_INPUT_VALUES, atol=1e-6)) def test_integration_target(self): # fmt: off @@ -410,4 +411,5 @@ def test_integration_target(self): input_speech = self._load_datasamples(1) feature_extractor = SpeechT5FeatureExtractor() input_values = feature_extractor(audio_target=input_speech, return_tensors="pt").input_values + self.assertEquals(input_values.shape, (1, 366, 80)) self.assertTrue(torch.allclose(input_values[0, 0, 
:30], EXPECTED_INPUT_VALUES, atol=1e-4)) diff --git a/tests/models/tvlt/test_feature_extraction_tvlt.py b/tests/models/tvlt/test_feature_extraction_tvlt.py index 9f73a732f197..560abd78f92e 100644 --- a/tests/models/tvlt/test_feature_extraction_tvlt.py +++ b/tests/models/tvlt/test_feature_extraction_tvlt.py @@ -198,10 +198,10 @@ def _load_datasamples(self, num_samples): def test_integration(self): input_speech = self._load_datasamples(1) - feaure_extractor = TvltFeatureExtractor() - audio_values = feaure_extractor(input_speech, return_tensors="pt").audio_values + feature_extractor = TvltFeatureExtractor() + audio_values = feature_extractor(input_speech, return_tensors="pt").audio_values - self.assertTrue(audio_values.shape, [1, 1, 192, 128]) + self.assertEquals(audio_values.shape, (1, 1, 192, 128)) expected_slice = torch.tensor([[-0.3032, -0.2708], [-0.4434, -0.4007]]) self.assertTrue(torch.allclose(audio_values[0, 0, :2, :2], expected_slice, atol=1e-4)) diff --git a/tests/models/whisper/test_feature_extraction_whisper.py b/tests/models/whisper/test_feature_extraction_whisper.py index 57c12b86ddbc..31ea28b9ad62 100644 --- a/tests/models/whisper/test_feature_extraction_whisper.py +++ b/tests/models/whisper/test_feature_extraction_whisper.py @@ -218,8 +218,9 @@ def test_integration(self): # fmt: on input_speech = self._load_datasamples(1) - feaure_extractor = WhisperFeatureExtractor() - input_features = feaure_extractor(input_speech, return_tensors="pt").input_features + feature_extractor = WhisperFeatureExtractor() + input_features = feature_extractor(input_speech, return_tensors="pt").input_features + self.assertEqual(input_features.shape, (1, 80, 3000)) self.assertTrue(torch.allclose(input_features[0, 0, :30], EXPECTED_INPUT_FEATURES, atol=1e-4)) def test_zero_mean_unit_variance_normalization_trunc_np_longest(self): diff --git a/tests/utils/test_audio_utils.py b/tests/utils/test_audio_utils.py new file mode 100644 index 000000000000..f0333113ea7e --- /dev/null +++ b/tests/utils/test_audio_utils.py @@ -0,0 +1,652 @@ +# coding=utf-8 +# Copyright 2023 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np +import pytest + +from transformers.audio_utils import ( + amplitude_to_db, + hertz_to_mel, + mel_filter_bank, + mel_to_hertz, + power_to_db, + spectrogram, + window_function, +) + + +class AudioUtilsFunctionTester(unittest.TestCase): + def test_hertz_to_mel(self): + self.assertEqual(hertz_to_mel(0.0), 0.0) + self.assertAlmostEqual(hertz_to_mel(100), 150.48910241) + + inputs = np.array([100, 200]) + expected = np.array([150.48910241, 283.22989816]) + self.assertTrue(np.allclose(hertz_to_mel(inputs), expected)) + + self.assertEqual(hertz_to_mel(0.0, "slaney"), 0.0) + self.assertEqual(hertz_to_mel(100, "slaney"), 1.5) + + inputs = np.array([60, 100, 200, 1000, 1001, 2000]) + expected = np.array([0.9, 1.5, 3.0, 15.0, 15.01453781, 25.08188016]) + self.assertTrue(np.allclose(hertz_to_mel(inputs, "slaney"), expected)) + + with pytest.raises(ValueError): + hertz_to_mel(100, mel_scale=None) + + def test_mel_to_hertz(self): + self.assertEqual(mel_to_hertz(0.0), 0.0) + self.assertAlmostEqual(mel_to_hertz(150.48910241), 100) + + inputs = np.array([150.48910241, 283.22989816]) + expected = np.array([100, 200]) + self.assertTrue(np.allclose(mel_to_hertz(inputs), expected)) + + self.assertEqual(mel_to_hertz(0.0, "slaney"), 0.0) + self.assertEqual(mel_to_hertz(1.5, "slaney"), 100) + + inputs = np.array([0.9, 1.5, 3.0, 15.0, 15.01453781, 25.08188016]) + expected = np.array([60, 100, 200, 1000, 1001, 2000]) + self.assertTrue(np.allclose(mel_to_hertz(inputs, "slaney"), expected)) + + with pytest.raises(ValueError): + mel_to_hertz(100, mel_scale=None) + + def test_mel_filter_bank_shape(self): + mel_filters = mel_filter_bank( + num_frequency_bins=513, + num_mel_filters=13, + min_frequency=100, + max_frequency=4000, + sampling_rate=16000, + norm=None, + mel_scale="htk", + ) + self.assertEqual(mel_filters.shape, (513, 13)) + + mel_filters = mel_filter_bank( + num_frequency_bins=513, + num_mel_filters=13, + min_frequency=100, + max_frequency=4000, + sampling_rate=16000, + norm="slaney", + mel_scale="slaney", + ) + self.assertEqual(mel_filters.shape, (513, 13)) + + def test_mel_filter_bank_htk(self): + mel_filters = mel_filter_bank( + num_frequency_bins=16, + num_mel_filters=4, + min_frequency=0, + max_frequency=2000, + sampling_rate=4000, + norm=None, + mel_scale="htk", + ) + # fmt: off + expected = np.array([ + [0.0 , 0.0 , 0.0 , 0.0 ], + [0.61454786, 0.0 , 0.0 , 0.0 ], + [0.82511046, 0.17488954, 0.0 , 0.0 ], + [0.35597035, 0.64402965, 0.0 , 0.0 ], + [0.0 , 0.91360726, 0.08639274, 0.0 ], + [0.0 , 0.55547007, 0.44452993, 0.0 ], + [0.0 , 0.19733289, 0.80266711, 0.0 ], + [0.0 , 0.0 , 0.87724349, 0.12275651], + [0.0 , 0.0 , 0.6038449 , 0.3961551 ], + [0.0 , 0.0 , 0.33044631, 0.66955369], + [0.0 , 0.0 , 0.05704771, 0.94295229], + [0.0 , 0.0 , 0.0 , 0.83483975], + [0.0 , 0.0 , 0.0 , 0.62612982], + [0.0 , 0.0 , 0.0 , 0.41741988], + [0.0 , 0.0 , 0.0 , 0.20870994], + [0.0 , 0.0 , 0.0 , 0.0 ] + ]) + # fmt: on + self.assertTrue(np.allclose(mel_filters, expected)) + + def test_mel_filter_bank_slaney(self): + mel_filters = mel_filter_bank( + num_frequency_bins=16, + num_mel_filters=4, + min_frequency=0, + max_frequency=2000, + sampling_rate=4000, + norm=None, + mel_scale="slaney", + ) + # fmt: off + expected = np.array([ + [0.0 , 0.0 , 0.0 , 0.0 ], + [0.39869419, 0.0 , 0.0 , 0.0 ], + [0.79738839, 0.0 , 0.0 , 0.0 ], + [0.80391742, 0.19608258, 0.0 , 0.0 ], + [0.40522322, 0.59477678, 0.0 , 0.0 ], + [0.00652903, 0.99347097, 0.0 , 0.0 ], + [0.0 , 0.60796161, 0.39203839, 0.0 ], + [0.0 , 
0.20939631, 0.79060369, 0.0 ], + [0.0 , 0.0 , 0.84685344, 0.15314656], + [0.0 , 0.0 , 0.52418477, 0.47581523], + [0.0 , 0.0 , 0.2015161 , 0.7984839 ], + [0.0 , 0.0 , 0.0 , 0.9141874 ], + [0.0 , 0.0 , 0.0 , 0.68564055], + [0.0 , 0.0 , 0.0 , 0.4570937 ], + [0.0 , 0.0 , 0.0 , 0.22854685], + [0.0 , 0.0 , 0.0 , 0.0 ] + ]) + # fmt: on + self.assertTrue(np.allclose(mel_filters, expected)) + + def test_mel_filter_bank_slaney_norm(self): + mel_filters = mel_filter_bank( + num_frequency_bins=16, + num_mel_filters=4, + min_frequency=0, + max_frequency=2000, + sampling_rate=4000, + norm="slaney", + mel_scale="slaney", + ) + # fmt: off + expected = np.array([ + [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00], + [1.19217795e-03, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00], + [2.38435591e-03, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00], + [2.40387905e-03, 5.86232616e-04, 0.00000000e+00, 0.00000000e+00], + [1.21170110e-03, 1.77821783e-03, 0.00000000e+00, 0.00000000e+00], + [1.95231437e-05, 2.97020305e-03, 0.00000000e+00, 0.00000000e+00], + [0.00000000e+00, 1.81763684e-03, 1.04857612e-03, 0.00000000e+00], + [0.00000000e+00, 6.26036972e-04, 2.11460963e-03, 0.00000000e+00], + [0.00000000e+00, 0.00000000e+00, 2.26505954e-03, 3.07332945e-04], + [0.00000000e+00, 0.00000000e+00, 1.40202503e-03, 9.54861093e-04], + [0.00000000e+00, 0.00000000e+00, 5.38990521e-04, 1.60238924e-03], + [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.83458185e-03], + [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.37593638e-03], + [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 9.17290923e-04], + [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.58645462e-04], + [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00] + ]) + # fmt: on + self.assertTrue(np.allclose(mel_filters, expected)) + + def test_window_function(self): + window = window_function(16, "hann") + self.assertEqual(len(window), 16) + + # fmt: off + expected = np.array([ + 0.0, 0.03806023, 0.14644661, 0.30865828, 0.5, 0.69134172, 0.85355339, 0.96193977, + 1.0, 0.96193977, 0.85355339, 0.69134172, 0.5, 0.30865828, 0.14644661, 0.03806023, + ]) + # fmt: on + self.assertTrue(np.allclose(window, expected)) + + def _load_datasamples(self, num_samples): + from datasets import load_dataset + + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] + return [x["array"] for x in speech_samples] + + def test_spectrogram_impulse(self): + waveform = np.zeros(40) + waveform[9] = 1.0 # impulse shifted in time + + spec = spectrogram( + waveform, + window_function(12, "hann", frame_length=16), + frame_length=16, + hop_length=4, + power=1.0, + center=True, + pad_mode="reflect", + onesided=True, + ) + self.assertEqual(spec.shape, (9, 11)) + + expected = np.array([[0.0, 0.0669873, 0.9330127, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]) + self.assertTrue(np.allclose(spec, expected)) + + def test_spectrogram_integration_test(self): + waveform = self._load_datasamples(1)[0] + + spec = spectrogram( + waveform, + window_function(400, "hann", frame_length=512), + frame_length=512, + hop_length=128, + power=1.0, + center=True, + pad_mode="reflect", + onesided=True, + ) + self.assertEqual(spec.shape, (257, 732)) + + # fmt: off + expected = np.array([ + 0.02464888, 0.04648664, 0.05872392, 0.02311783, 0.0327175 , + 0.02433643, 0.01198814, 0.02055709, 0.01559287, 0.01394357, + 0.01299037, 0.01728045, 0.0254554 , 0.02486533, 0.02011792, + 
0.01755333, 0.02100457, 0.02337024, 0.01436963, 0.01464558, + 0.0211017 , 0.0193489 , 0.01272165, 0.01858462, 0.03722598, + 0.0456542 , 0.03281558, 0.00620586, 0.02226466, 0.03618042, + 0.03508182, 0.02271432, 0.01051649, 0.01225771, 0.02315293, + 0.02331886, 0.01417785, 0.0106844 , 0.01791214, 0.017177 , + 0.02125114, 0.05028201, 0.06830665, 0.05216664, 0.01963666, + 0.06941418, 0.11513043, 0.12257859, 0.10948435, 0.08568069, + 0.05509328, 0.05047818, 0.047112 , 0.05060737, 0.02982424, + 0.02803827, 0.02933729, 0.01760491, 0.00587815, 0.02117637, + 0.0293578 , 0.03452379, 0.02194803, 0.01676056, + ]) + # fmt: on + self.assertTrue(np.allclose(spec[:64, 400], expected)) + + spec = spectrogram( + waveform, + window_function(400, "hann"), + frame_length=400, + hop_length=128, + fft_length=512, + power=1.0, + center=True, + pad_mode="reflect", + onesided=True, + ) + self.assertEqual(spec.shape, (257, 732)) + self.assertTrue(np.allclose(spec[:64, 400], expected)) + + def test_spectrogram_center_padding(self): + waveform = self._load_datasamples(1)[0] + + spec = spectrogram( + waveform, + window_function(512, "hann"), + frame_length=512, + hop_length=128, + center=True, + pad_mode="reflect", + ) + self.assertEqual(spec.shape, (257, 732)) + + # fmt: off + expected = np.array([ + 0.1287945 , 0.12792738, 0.08311573, 0.03155122, 0.02470202, + 0.00727857, 0.00910694, 0.00686163, 0.01238981, 0.01473668, + 0.00336144, 0.00370314, 0.00600871, 0.01120164, 0.01942998, + 0.03132008, 0.0232842 , 0.01124642, 0.02754783, 0.02423725, + 0.00147893, 0.00038027, 0.00112299, 0.00596233, 0.00571529, + 0.02084235, 0.0231855 , 0.00810006, 0.01837943, 0.00651339, + 0.00093931, 0.00067426, 0.01058399, 0.01270507, 0.00151734, + 0.00331913, 0.00302416, 0.01081792, 0.00754549, 0.00148963, + 0.00111943, 0.00152573, 0.00608017, 0.01749986, 0.01205949, + 0.0143082 , 0.01910573, 0.00413786, 0.03916619, 0.09873404, + 0.08302026, 0.02673891, 0.00401255, 0.01397392, 0.00751862, + 0.01024884, 0.01544606, 0.00638907, 0.00623633, 0.0085103 , + 0.00217659, 0.00276204, 0.00260835, 0.00299299, + ]) + # fmt: on + self.assertTrue(np.allclose(spec[:64, 0], expected)) + + spec = spectrogram( + waveform, + window_function(512, "hann"), + frame_length=512, + hop_length=128, + center=True, + pad_mode="constant", + ) + self.assertEqual(spec.shape, (257, 732)) + + # fmt: off + expected = np.array([ + 0.06558744, 0.06889656, 0.06263352, 0.04264418, 0.03404115, + 0.03244197, 0.02279134, 0.01646339, 0.01452216, 0.00826055, + 0.00062093, 0.0031821 , 0.00419456, 0.00689327, 0.01106367, + 0.01712119, 0.01721762, 0.00977533, 0.01606626, 0.02275621, + 0.01727687, 0.00992739, 0.01217688, 0.01049927, 0.01022947, + 0.01302475, 0.01166873, 0.01081812, 0.01057327, 0.00767912, + 0.00429567, 0.00089625, 0.00654583, 0.00912084, 0.00700984, + 0.00225026, 0.00290545, 0.00667712, 0.00730663, 0.00410813, + 0.00073102, 0.00219296, 0.00527618, 0.00996585, 0.01123781, + 0.00872816, 0.01165121, 0.02047945, 0.03681747, 0.0514379 , + 0.05137928, 0.03960042, 0.02821562, 0.01813349, 0.01201322, + 0.01260964, 0.00900654, 0.00207905, 0.00456714, 0.00850599, + 0.00788239, 0.00664407, 0.00824227, 0.00628301, + ]) + # fmt: on + self.assertTrue(np.allclose(spec[:64, 0], expected)) + + spec = spectrogram( + waveform, + window_function(512, "hann"), + frame_length=512, + hop_length=128, + center=False, + ) + self.assertEqual(spec.shape, (257, 728)) + + # fmt: off + expected = np.array([ + 0.00250445, 0.02161521, 0.06232229, 0.04339567, 0.00937727, + 0.01080616, 0.00248685, 
0.0095264 , 0.00727476, 0.0079152 , + 0.00839946, 0.00254932, 0.00716622, 0.005559 , 0.00272623, + 0.00581774, 0.01896395, 0.01829788, 0.01020514, 0.01632692, + 0.00870888, 0.02065827, 0.0136022 , 0.0132382 , 0.011827 , + 0.00194505, 0.0189979 , 0.026874 , 0.02194014, 0.01923883, + 0.01621437, 0.00661967, 0.00289517, 0.00470257, 0.00957801, + 0.00191455, 0.00431664, 0.00544359, 0.01126213, 0.00785778, + 0.00423469, 0.01322504, 0.02226548, 0.02318576, 0.03428908, + 0.03648811, 0.0202938 , 0.011902 , 0.03226198, 0.06347476, + 0.01306318, 0.05308729, 0.05474771, 0.03127991, 0.00998512, + 0.01449977, 0.01272741, 0.00868176, 0.00850386, 0.00313876, + 0.00811857, 0.00538216, 0.00685749, 0.00535275, + ]) + # fmt: on + self.assertTrue(np.allclose(spec[:64, 0], expected)) + + def test_spectrogram_shapes(self): + waveform = self._load_datasamples(1)[0] + + spec = spectrogram( + waveform, + window_function(400, "hann"), + frame_length=400, + hop_length=128, + power=1.0, + center=True, + pad_mode="reflect", + onesided=True, + ) + self.assertEqual(spec.shape, (201, 732)) + + spec = spectrogram( + waveform, + window_function(400, "hann"), + frame_length=400, + hop_length=128, + power=1.0, + center=False, + pad_mode="reflect", + onesided=True, + ) + self.assertEqual(spec.shape, (201, 729)) + + spec = spectrogram( + waveform, + window_function(400, "hann"), + frame_length=400, + hop_length=128, + fft_length=512, + power=1.0, + center=True, + pad_mode="reflect", + onesided=True, + ) + self.assertEqual(spec.shape, (257, 732)) + + spec = spectrogram( + waveform, + window_function(400, "hann", frame_length=512), + frame_length=512, + hop_length=64, + power=1.0, + center=True, + pad_mode="reflect", + onesided=False, + ) + self.assertEqual(spec.shape, (512, 1464)) + + spec = spectrogram( + waveform, + window_function(512, "hann"), + frame_length=512, + hop_length=64, + power=1.0, + center=True, + pad_mode="reflect", + onesided=False, + ) + self.assertEqual(spec.shape, (512, 1464)) + + spec = spectrogram( + waveform, + window_function(512, "hann"), + frame_length=512, + hop_length=512, + power=1.0, + center=True, + pad_mode="reflect", + onesided=False, + ) + self.assertEqual(spec.shape, (512, 183)) + + def test_mel_spectrogram(self): + waveform = self._load_datasamples(1)[0] + + mel_filters = mel_filter_bank( + num_frequency_bins=513, + num_mel_filters=13, + min_frequency=100, + max_frequency=4000, + sampling_rate=16000, + norm=None, + mel_scale="htk", + ) + self.assertEqual(mel_filters.shape, (513, 13)) + + spec = spectrogram( + waveform, + window_function(800, "hann", frame_length=1024), + frame_length=1024, + hop_length=128, + power=2.0, + ) + self.assertEqual(spec.shape, (513, 732)) + + spec = spectrogram( + waveform, + window_function(800, "hann", frame_length=1024), + frame_length=1024, + hop_length=128, + power=2.0, + mel_filters=mel_filters, + ) + self.assertEqual(spec.shape, (13, 732)) + + # fmt: off + expected = np.array([ + 1.08027889e+02, 1.48080673e+01, 7.70758213e+00, 9.57676639e-01, + 8.81639061e-02, 5.26073833e-02, 1.52736155e-02, 9.95350117e-03, + 7.95364356e-03, 1.01148004e-02, 4.29241020e-03, 9.90708797e-03, + 9.44153646e-04 + ]) + # fmt: on + self.assertTrue(np.allclose(spec[:, 300], expected)) + + def test_spectrogram_power(self): + waveform = self._load_datasamples(1)[0] + + spec = spectrogram( + waveform, + window_function(400, "hann", frame_length=512), + frame_length=512, + hop_length=128, + power=None, + ) + self.assertEqual(spec.shape, (257, 732)) + self.assertEqual(spec.dtype, 
np.complex64) + + # fmt: off + expected = np.array([ + 0.01452305+0.01820039j, -0.01737362-0.01641946j, + 0.0121028 +0.01565081j, -0.02794554-0.03021514j, + 0.04719803+0.04086519j, -0.04391563-0.02779365j, + 0.05682834+0.01571325j, -0.08604821-0.02023657j, + 0.07497991+0.0186641j , -0.06366091-0.00922475j, + 0.11003416+0.0114788j , -0.13677941-0.01523552j, + 0.10934535-0.00117226j, -0.11635598+0.02551187j, + 0.14708674-0.03469823j, -0.1328196 +0.06034218j, + 0.12667368-0.13973421j, -0.14764774+0.18912019j, + 0.10235471-0.12181523j, -0.00773012+0.04730498j, + -0.01487191-0.07312611j, -0.02739162+0.09619419j, + 0.02895459-0.05398273j, 0.01198589+0.05276592j, + -0.02117299-0.10123465j, 0.00666388+0.09526499j, + -0.01672773-0.05649684j, 0.02723125+0.05939891j, + -0.01879361-0.062954j , 0.03686557+0.04568823j, + -0.07394181-0.07949649j, 0.06238583+0.13905765j, + ]) + # fmt: on + self.assertTrue(np.allclose(spec[64:96, 321], expected)) + + spec = spectrogram( + waveform, + window_function(400, "hann", frame_length=512), + frame_length=512, + hop_length=128, + power=1.0, + ) + self.assertEqual(spec.shape, (257, 732)) + self.assertEqual(spec.dtype, np.float64) + + # fmt: off + expected = np.array([ + 0.02328461, 0.02390484, 0.01978448, 0.04115711, 0.0624309 , + 0.05197181, 0.05896072, 0.08839577, 0.07726794, 0.06432579, + 0.11063128, 0.13762532, 0.10935163, 0.11911998, 0.15112405, + 0.14588428, 0.18860507, 0.23992978, 0.15910825, 0.04793241, + 0.07462307, 0.10001811, 0.06125769, 0.05411011, 0.10342509, + 0.09549777, 0.05892122, 0.06534349, 0.06569936, 0.05870678, + 0.10856833, 0.1524107 , 0.11463385, 0.05766969, 0.12385171, + 0.14472842, 0.11978184, 0.10353675, 0.07244056, 0.03461861, + 0.02624896, 0.02227475, 0.01238363, 0.00885281, 0.0110049 , + 0.00807005, 0.01033663, 0.01703181, 0.01445856, 0.00585615, + 0.0132431 , 0.02754132, 0.01524478, 0.0204908 , 0.07453328, + 0.10716327, 0.07195779, 0.08816078, 0.18340898, 0.16449876, + 0.12322842, 0.1621659 , 0.12334293, 0.06033659, + ]) + # fmt: on + self.assertTrue(np.allclose(spec[64:128, 321], expected)) + + spec = spectrogram( + waveform, + window_function(400, "hann", frame_length=512), + frame_length=512, + hop_length=128, + power=2.0, + ) + self.assertEqual(spec.shape, (257, 732)) + self.assertEqual(spec.dtype, np.float64) + + # fmt: off + expected = np.array([ + 5.42173162e-04, 5.71441371e-04, 3.91425507e-04, 1.69390778e-03, + 3.89761780e-03, 2.70106923e-03, 3.47636663e-03, 7.81381316e-03, + 5.97033510e-03, 4.13780799e-03, 1.22392802e-02, 1.89407300e-02, + 1.19577805e-02, 1.41895693e-02, 2.28384770e-02, 2.12822221e-02, + 3.55718732e-02, 5.75663000e-02, 2.53154356e-02, 2.29751552e-03, + 5.56860259e-03, 1.00036217e-02, 3.75250424e-03, 2.92790355e-03, + 1.06967501e-02, 9.11982451e-03, 3.47171025e-03, 4.26977174e-03, + 4.31640586e-03, 3.44648538e-03, 1.17870830e-02, 2.32290216e-02, + 1.31409196e-02, 3.32579296e-03, 1.53392460e-02, 2.09463164e-02, + 1.43476883e-02, 1.07198600e-02, 5.24763530e-03, 1.19844836e-03, + 6.89007982e-04, 4.96164430e-04, 1.53354369e-04, 7.83722571e-05, + 1.21107812e-04, 6.51257360e-05, 1.06845939e-04, 2.90082477e-04, + 2.09049831e-04, 3.42945241e-05, 1.75379610e-04, 7.58524227e-04, + 2.32403356e-04, 4.19872697e-04, 5.55520924e-03, 1.14839673e-02, + 5.17792348e-03, 7.77232368e-03, 3.36388536e-02, 2.70598419e-02, + 1.51852425e-02, 2.62977779e-02, 1.52134784e-02, 3.64050455e-03, + ]) + # fmt: on + self.assertTrue(np.allclose(spec[64:128, 321], expected)) + + def test_power_to_db(self): + spectrogram = np.zeros((2, 3)) + 
spectrogram[0, 0] = 2.0 + spectrogram[0, 1] = 0.5 + spectrogram[0, 2] = 0.707 + spectrogram[1, 1] = 1.0 + + output = power_to_db(spectrogram, reference=1.0) + expected = np.array([[3.01029996, -3.01029996, -1.50580586], [-100.0, 0.0, -100.0]]) + self.assertTrue(np.allclose(output, expected)) + + output = power_to_db(spectrogram, reference=2.0) + expected = np.array([[0.0, -6.02059991, -4.51610582], [-103.01029996, -3.01029996, -103.01029996]]) + self.assertTrue(np.allclose(output, expected)) + + output = power_to_db(spectrogram, min_value=1e-6) + expected = np.array([[3.01029996, -3.01029996, -1.50580586], [-60.0, 0.0, -60.0]]) + self.assertTrue(np.allclose(output, expected)) + + output = power_to_db(spectrogram, db_range=80) + expected = np.array([[3.01029996, -3.01029996, -1.50580586], [-76.98970004, 0.0, -76.98970004]]) + self.assertTrue(np.allclose(output, expected)) + + output = power_to_db(spectrogram, reference=2.0, db_range=80) + expected = np.array([[0.0, -6.02059991, -4.51610582], [-80.0, -3.01029996, -80.0]]) + self.assertTrue(np.allclose(output, expected)) + + output = power_to_db(spectrogram, reference=2.0, min_value=1e-6, db_range=80) + expected = np.array([[0.0, -6.02059991, -4.51610582], [-63.01029996, -3.01029996, -63.01029996]]) + self.assertTrue(np.allclose(output, expected)) + + with pytest.raises(ValueError): + power_to_db(spectrogram, reference=0.0) + with pytest.raises(ValueError): + power_to_db(spectrogram, min_value=0.0) + with pytest.raises(ValueError): + power_to_db(spectrogram, db_range=-80) + + def test_amplitude_to_db(self): + spectrogram = np.zeros((2, 3)) + spectrogram[0, 0] = 2.0 + spectrogram[0, 1] = 0.5 + spectrogram[0, 2] = 0.707 + spectrogram[1, 1] = 1.0 + + output = amplitude_to_db(spectrogram, reference=1.0) + expected = np.array([[6.02059991, -6.02059991, -3.01161172], [-100.0, 0.0, -100.0]]) + self.assertTrue(np.allclose(output, expected)) + + output = amplitude_to_db(spectrogram, reference=2.0) + expected = np.array([[0.0, -12.04119983, -9.03221164], [-106.02059991, -6.02059991, -106.02059991]]) + self.assertTrue(np.allclose(output, expected)) + + output = amplitude_to_db(spectrogram, min_value=1e-3) + expected = np.array([[6.02059991, -6.02059991, -3.01161172], [-60.0, 0.0, -60.0]]) + self.assertTrue(np.allclose(output, expected)) + + output = amplitude_to_db(spectrogram, db_range=80) + expected = np.array([[6.02059991, -6.02059991, -3.01161172], [-73.97940009, 0.0, -73.97940009]]) + self.assertTrue(np.allclose(output, expected)) + + output = amplitude_to_db(spectrogram, reference=2.0, db_range=80) + expected = np.array([[0.0, -12.04119983, -9.03221164], [-80.0, -6.02059991, -80.0]]) + self.assertTrue(np.allclose(output, expected)) + + output = amplitude_to_db(spectrogram, reference=2.0, min_value=1e-3, db_range=80) + expected = np.array([[0.0, -12.04119983, -9.03221164], [-66.02059991, -6.02059991, -66.02059991]]) + self.assertTrue(np.allclose(output, expected)) + + with pytest.raises(ValueError): + amplitude_to_db(spectrogram, reference=0.0) + with pytest.raises(ValueError): + amplitude_to_db(spectrogram, min_value=0.0) + with pytest.raises(ValueError): + amplitude_to_db(spectrogram, db_range=-80) From e02a8065e0515c10d7ebd19855adeb002c18268d Mon Sep 17 00:00:00 2001 From: dumpmemory <64742282+dumpmemory@users.noreply.github.com> Date: Tue, 9 May 2023 21:14:02 +0800 Subject: [PATCH 051/935] make opt checkpoint dir name correct (#21660) make opt checkpoint dir name corrent following 
https://github.com/huggingface/Megatron-LM/blob/100b522bb8044d98413398f9e71563af15b83325/megatron/checkpointing.py#L117 --- .../checkpoint_reshaping_and_interoperability.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/megatron_gpt2/checkpoint_reshaping_and_interoperability.py b/src/transformers/models/megatron_gpt2/checkpoint_reshaping_and_interoperability.py index 49a462491f01..fffed940105e 100644 --- a/src/transformers/models/megatron_gpt2/checkpoint_reshaping_and_interoperability.py +++ b/src/transformers/models/megatron_gpt2/checkpoint_reshaping_and_interoperability.py @@ -690,9 +690,9 @@ def convert_checkpoint_from_transformers_to_megatron(args): for j in range(args.target_tensor_model_parallel_size): for k in range(args.target_data_parallel_size): if args.target_pipeline_model_parallel_size == 1: - checkpoint_dir = f"mp_rank_{i:02d}_{k:03d}" + checkpoint_dir = f"mp_rank_{j:02d}_{i:03d}" else: - checkpoint_dir = f"mp_rank_{i:02d}_{j:03d}_{k:03d}" + checkpoint_dir = f"mp_rank_{j:02d}_{i:03d}_{k:03d}" checkpoint_dir = os.path.join(release_dir, checkpoint_dir) os.makedirs(checkpoint_dir, exist_ok=True) torch.save( From 51ae566511280251e879ea6c3d69a56a7d3e83dc Mon Sep 17 00:00:00 2001 From: Furkan Akkurt <71407287+furkanakkurt1335@users.noreply.github.com> Date: Tue, 9 May 2023 16:19:38 +0300 Subject: [PATCH 052/935] Fix typo ; Update output.mdx (#23227) --- docs/source/en/main_classes/output.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/main_classes/output.mdx b/docs/source/en/main_classes/output.mdx index ca4e8dfc0ace..921031671cad 100644 --- a/docs/source/en/main_classes/output.mdx +++ b/docs/source/en/main_classes/output.mdx @@ -31,7 +31,7 @@ outputs = model(**inputs, labels=labels) ``` The `outputs` object is a [`~modeling_outputs.SequenceClassifierOutput`], as we can see in the -documentation of that class below, it means it has an optional `loss`, a `logits` an optional `hidden_states` and +documentation of that class below, it means it has an optional `loss`, a `logits`, an optional `hidden_states` and an optional `attentions` attribute. Here we have the `loss` since we passed along `labels`, but we don't have `hidden_states` and `attentions` because we didn't pass `output_hidden_states=True` or `output_attentions=True`. From 1a8f61110e4a6b90f75b0ac40ea8cea2817da5ac Mon Sep 17 00:00:00 2001 From: Sebastian Date: Tue, 9 May 2023 15:20:10 +0200 Subject: [PATCH 053/935] fix: Update run_qa.py to work with deepset/germanquad (#23225) Call str on id to make sure any ints are converted into the expected format for squad datasets --- examples/pytorch/question-answering/run_qa.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/pytorch/question-answering/run_qa.py b/examples/pytorch/question-answering/run_qa.py index d3377611bdfd..b0dcf6e5d836 100755 --- a/examples/pytorch/question-answering/run_qa.py +++ b/examples/pytorch/question-answering/run_qa.py @@ -590,12 +590,12 @@ def post_processing_function(examples, features, predictions, stage="eval"): # Format the result to the format the metric expects. 
if data_args.version_2_with_negative: formatted_predictions = [ - {"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items() + {"id": str(k), "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items() ] else: - formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()] + formatted_predictions = [{"id": str(k), "prediction_text": v} for k, v in predictions.items()] - references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples] + references = [{"id": str(ex["id"]), "answers": ex[answer_column_name]} for ex in examples] return EvalPrediction(predictions=formatted_predictions, label_ids=references) metric = evaluate.load("squad_v2" if data_args.version_2_with_negative else "squad") From 9a50cb6195cb477c1feae8f0fb2bce20b679246f Mon Sep 17 00:00:00 2001 From: Rustin Welter <131769788+rustinwelter@users.noreply.github.com> Date: Tue, 9 May 2023 14:51:43 +0000 Subject: [PATCH 054/935] Add Japanese translation to accelerate.mdx (#23232) Co-authored-by: rustinwelter --- docs/source/ja/_toctree.yml | 4 ++ docs/source/ja/accelerate.mdx | 132 ++++++++++++++++++++++++++++++++++ 2 files changed, 136 insertions(+) create mode 100644 docs/source/ja/accelerate.mdx diff --git a/docs/source/ja/_toctree.yml b/docs/source/ja/_toctree.yml index a85f0ee11dcf..8ac8b1e3183f 100644 --- a/docs/source/ja/_toctree.yml +++ b/docs/source/ja/_toctree.yml @@ -4,6 +4,10 @@ - local: installation title: インストール title: はじめに +- sections: + - local: accelerate + title: 🤗 Accelerate を用いた分散学習 + title: チュートリアル - sections: - sections: - local: multilingual diff --git a/docs/source/ja/accelerate.mdx b/docs/source/ja/accelerate.mdx new file mode 100644 index 000000000000..823ed0dcf72b --- /dev/null +++ b/docs/source/ja/accelerate.mdx @@ -0,0 +1,132 @@ + + +# 🤗 Accelerate を用いた分散学習 + +モデルが大きくなるにつれて、限られたハードウェアでより大きなモデルを訓練し、訓練速度を大幅に上昇させるための方法として並列処理が浮上してきました。1台のマシンに複数のGPUがあっても、複数のマシンにまたがる複数のGPUがあっても、あらゆるタイプの分散処理セットアップ上でユーザーが簡単に 🤗 Transformers モデルを訓練できるように、 Hugging Face では [🤗 Accelerate](https://huggingface.co/docs/accelerate) ライブラリを作成しました。このチュートリアルでは、PyTorch の訓練ループをカスタマイズして、分散処理環境での訓練を可能にする方法について学びます。 + +## セットアップ + +はじめに 🤗 Accelerate をインストールしましょう: + +```bash +pip install accelerate +``` + +そしたらインポートして [`~accelerate.Accelerator`] オブジェクトを作成しましょう。[`~accelerate.Accelerator`] は分散処理セットアップを自動的に検出し、訓練のために必要な全てのコンポーネントを初期化します。モデルをデバイスに明示的に配置する必要はありません。 + +```py +>>> from accelerate import Accelerator + +>>> accelerator = Accelerator() +``` + +## Accelerate する準備をしましょう + +次に、関連する全ての訓練オブジェクトを [`~accelerate.Accelerator.prepare`] メソッドに渡します。これには、訓練と評価それぞれのDataloader、モデル、optimizer が含まれます: + +```py +>>> train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( +... train_dataloader, eval_dataloader, model, optimizer +... ) +``` + +## Backward + +最後に訓練ループ内の `loss.backward()` を 🤗 Accelerate の [`~accelerate.Accelerator.backward`] メソッドで置き換えます: + +```py +>>> for epoch in range(num_epochs): +... for batch in train_dataloader: +... outputs = model(**batch) +... loss = outputs.loss +... accelerator.backward(loss) + +... optimizer.step() +... lr_scheduler.step() +... optimizer.zero_grad() +... progress_bar.update(1) +``` + +以下のコードで確認できる通り、訓練ループに4行のコードを追加するだけで分散学習が可能です! 
+ +```diff ++ from accelerate import Accelerator + from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler + ++ accelerator = Accelerator() + + model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2) + optimizer = AdamW(model.parameters(), lr=3e-5) + +- device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") +- model.to(device) + ++ train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( ++ train_dataloader, eval_dataloader, model, optimizer ++ ) + + num_epochs = 3 + num_training_steps = num_epochs * len(train_dataloader) + lr_scheduler = get_scheduler( + "linear", + optimizer=optimizer, + num_warmup_steps=0, + num_training_steps=num_training_steps + ) + + progress_bar = tqdm(range(num_training_steps)) + + model.train() + for epoch in range(num_epochs): + for batch in train_dataloader: +- batch = {k: v.to(device) for k, v in batch.items()} + outputs = model(**batch) + loss = outputs.loss +- loss.backward() ++ accelerator.backward(loss) + + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + progress_bar.update(1) +``` + +## 訓練する + +関連するコードを追加したら、スクリプトまたは Colaboratory などのノートブックで訓練を開始します。 + +### スクリプトで訓練する + +スクリプトから訓練をしている場合は、設定ファイルを作成・保存するために以下のコマンドを実行してください: + +```bash +accelerate config +``` + +そして次のようにして訓練を開始します: + +```bash +accelerate launch train.py +``` + +### ノートブックで訓練する + +Colaboratory の TPU の利用をお考えの場合、🤗 Accelerate はノートブック上で実行することもできます。訓練に必要な全てのコードを関数に含め、[`~accelerate.notebook_launcher`] に渡してください: + +```py +>>> from accelerate import notebook_launcher + +>>> notebook_launcher(training_function) +``` + +🤗 Accelerate と豊富な機能についてもっと知りたい方は[ドキュメント](https://huggingface.co/docs/accelerate)を参照してください。 From b4d4d6fe87ffcd7508307970cdf8fa3eda288701 Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Tue, 9 May 2023 13:04:10 -0400 Subject: [PATCH 055/935] Add RWKV-4 (#22797) * First draft of RWKV-4 * Add support for generate * Style post-rebase * Properly use state * Write doc * Fix doc * More math * Add model to README, dummies and clean config * Fix init * multiple fixes: - fix common tests - fix configuraion default values - add CI test for checking state computation - fix some CI tests * correct tokenizer * some tweaks - fix config docstring - fix failing tests * fix CI tests - add output_attention / output_hidden_states - override test_initialization - fix failing CIs * fix conversion script - fix sharded case - add new arguments * add slow tests + more fixes on conversion script * add another test * final fixes * change single name variable * add mock attention mask for pipeline to work * correct eos token id * fix nits * add checkpoints * Apply suggestions from code review Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * add `tie_word_embeddings` in docstring * change tensor name * fix final nits * Trigger CI --------- Co-authored-by: younesbelkada Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- README.md | 1 + README_es.md | 1 + README_hd.md | 1 + README_ja.md | 1 + README_ko.md | 1 + README_zh-hans.md | 1 + README_zh-hant.md | 1 + docs/source/en/_toctree.yml | 2 + docs/source/en/index.mdx | 2 + docs/source/en/model_doc/rwkv.mdx | 129 +++ docs/source/en/tasks/language_modeling.mdx | 2 +- src/transformers/__init__.py | 16 + src/transformers/generation/utils.py | 2 + 
src/transformers/kernels/rwkv/wkv_cuda.cu | 187 ++++ .../kernels/rwkv/wkv_cuda_bf16.cu | 186 ++++ src/transformers/kernels/rwkv/wkv_op.cpp | 66 ++ src/transformers/models/__init__.py | 1 + .../models/auto/configuration_auto.py | 3 + src/transformers/models/auto/modeling_auto.py | 4 + .../models/auto/tokenization_auto.py | 1 + src/transformers/models/rwkv/__init__.py | 60 ++ .../models/rwkv/configuration_rwkv.py | 130 +++ .../rwkv/convert_rwkv_checkpoint_to_hf.py | 201 +++++ src/transformers/models/rwkv/modeling_rwkv.py | 804 ++++++++++++++++++ src/transformers/utils/dummy_pt_objects.py | 24 + tests/models/rwkv/__init__.py | 0 tests/models/rwkv/test_modeling_rwkv.py | 451 ++++++++++ tests/test_configuration_common.py | 9 +- 28 files changed, 2284 insertions(+), 3 deletions(-) create mode 100644 docs/source/en/model_doc/rwkv.mdx create mode 100644 src/transformers/kernels/rwkv/wkv_cuda.cu create mode 100644 src/transformers/kernels/rwkv/wkv_cuda_bf16.cu create mode 100644 src/transformers/kernels/rwkv/wkv_op.cpp create mode 100644 src/transformers/models/rwkv/__init__.py create mode 100644 src/transformers/models/rwkv/configuration_rwkv.py create mode 100644 src/transformers/models/rwkv/convert_rwkv_checkpoint_to_hf.py create mode 100644 src/transformers/models/rwkv/modeling_rwkv.py create mode 100644 tests/models/rwkv/__init__.py create mode 100644 tests/models/rwkv/test_modeling_rwkv.py diff --git a/README.md b/README.md index be4f2d103148..fdceaa1643e0 100644 --- a/README.md +++ b/README.md @@ -422,6 +422,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli. 1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou. 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu. +1. **[RWKV](https://huggingface.co/docs/transformers/main/model_doc/rwkv)** (from Bo Peng), released on [this repo](https://github.com/BlinkDL/RWKV-LM) by Bo Peng. 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo. 1. **[Segment Anything](https://huggingface.co/docs/transformers/main/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick. 1. 
**[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. diff --git a/README_es.md b/README_es.md index 58d6fc3cf338..c3c569c531d6 100644 --- a/README_es.md +++ b/README_es.md @@ -410,6 +410,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli. 1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou. 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu. +1. **[RWKV](https://huggingface.co/docs/transformers/main/model_doc/rwkv)** (from Bo Peng) released with the paper [this repo](https://github.com/BlinkDL/RWKV-LM) by Bo Peng. 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo. 1. **[Segment Anything](https://huggingface.co/docs/transformers/main/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick. 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. diff --git a/README_hd.md b/README_hd.md index e580c37794b2..af9c58011455 100644 --- a/README_hd.md +++ b/README_hd.md @@ -382,6 +382,7 @@ conda install -c huggingface transformers 1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli. 1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou. 1. 
**[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (झुईई टेक्नोलॉजी से), साथ में पेपर [रोफॉर्मर: रोटरी पोजिशन एंबेडिंग के साथ एन्हांस्ड ट्रांसफॉर्मर] (https://arxiv.org/pdf/2104.09864v1.pdf) जियानलिन सु और यू लू और शेंगफेंग पैन और बो वेन और युनफेंग लियू द्वारा प्रकाशित। +1. **[RWKV](https://huggingface.co/docs/transformers/main/model_doc/rwkv)** (Bo Peng से) Bo Peng. द्वाराअनुसंधान पत्र [this repo](https://github.com/BlinkDL/RWKV-LM) के साथ जारी किया गया 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo. 1. **[Segment Anything](https://huggingface.co/docs/transformers/main/model_doc/sam)** (Meta AI से) Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick. द्वाराअनुसंधान पत्र [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) के साथ जारी किया गया 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (ASAPP से) साथ देने वाला पेपर [भाषण पहचान के लिए अनसुपरवाइज्ड प्री-ट्रेनिंग में परफॉर्मेंस-एफिशिएंसी ट्रेड-ऑफ्स](https ://arxiv.org/abs/2109.06870) फेलिक्स वू, क्वांगयुन किम, जिंग पैन, क्यू हान, किलियन क्यू. वेनबर्गर, योव आर्टज़ी द्वारा। diff --git a/README_ja.md b/README_ja.md index ebb173495c15..953ff5598e4a 100644 --- a/README_ja.md +++ b/README_ja.md @@ -444,6 +444,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (Facebook から) Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli から公開された研究論文: [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) 1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (WeChatAI から) HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou から公開された研究論文: [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (ZhuiyiTechnology から), Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu から公開された研究論文: [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) +1. **[RWKV](https://huggingface.co/docs/transformers/main/model_doc/rwkv)** (Bo Peng から) Bo Peng. から公開された研究論文 [this repo](https://github.com/BlinkDL/RWKV-LM) 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (NVIDIA から) Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo から公開された研究論文: [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) 1. **[Segment Anything](https://huggingface.co/docs/transformers/main/model_doc/sam)** (Meta AI から) Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick. から公開された研究論文 [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (ASAPP から) Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. 
Weinberger, Yoav Artzi から公開された研究論文: [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) diff --git a/README_ko.md b/README_ko.md index cf01054dcc49..2707f191dad0 100644 --- a/README_ko.md +++ b/README_ko.md @@ -359,6 +359,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (Facebook 에서) Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli 의 [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) 논문과 함께 발표했습니다. 1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (WeChatAI 에서) HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou 의 [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) 논문과 함께 발표했습니다. 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (ZhuiyiTechnology 에서) Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu 의 a [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) 논문과 함께 발표했습니다. +1. **[RWKV](https://huggingface.co/docs/transformers/main/model_doc/rwkv)** (Bo Peng 에서 제공)은 Bo Peng.의 [this repo](https://github.com/BlinkDL/RWKV-LM)논문과 함께 발표했습니다. 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (NVIDIA 에서) Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo 의 [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) 논문과 함께 발표했습니다. 1. **[Segment Anything](https://huggingface.co/docs/transformers/main/model_doc/sam)** (Meta AI 에서 제공)은 Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.의 [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf)논문과 함께 발표했습니다. 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (ASAPP 에서) Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi 의 [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) 논문과 함께 발표했습니다. diff --git a/README_zh-hans.md b/README_zh-hans.md index b46c84983d9b..74d39b1df3d6 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -383,6 +383,7 @@ conda install -c huggingface transformers 1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (来自 Facebook) 伴随论文 [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) 由 Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli 发布。 1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (来自 WeChatAI), 伴随论文 [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) 由 HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou 发布。 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (来自 ZhuiyiTechnology), 伴随论文 [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) 由 Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu 发布。 +1. 
**[RWKV](https://huggingface.co/docs/transformers/main/model_doc/rwkv)** (来自 Bo Peng) 伴随论文 [this repo](https://github.com/BlinkDL/RWKV-LM) 由 Bo Peng 发布。 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (来自 NVIDIA) 伴随论文 [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) 由 Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo 发布。 1. **[Segment Anything](https://huggingface.co/docs/transformers/main/model_doc/sam)** (来自 Meta AI) 伴随论文 [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) 由 Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick 发布。 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (来自 ASAPP) 伴随论文 [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) 由 Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index c360cce25a83..4fd2a3caad54 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -395,6 +395,7 @@ conda install -c huggingface transformers 1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli. 1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou. 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper a [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu. +1. **[RWKV](https://huggingface.co/docs/transformers/main/model_doc/rwkv)** (from Bo Peng) released with the paper [this repo](https://github.com/BlinkDL/RWKV-LM) by Bo Peng. 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo. 1. **[Segment Anything](https://huggingface.co/docs/transformers/main/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick. 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. 
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index f6e9684f79e3..c92f21a93458 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -399,6 +399,8 @@ title: RoCBert - local: model_doc/roformer title: RoFormer + - local: model_doc/rwkv + title: RWKV - local: model_doc/splinter title: Splinter - local: model_doc/squeezebert
diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index cc5724b5fd6c..c9af68ae078e 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -196,6 +196,7 @@ The documentation is organized into five sections: 1. **[RoBERTa-PreLayerNorm](model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli. 1. **[RoCBert](model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou. 1. **[RoFormer](model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu. +1. **[RWKV](model_doc/rwkv)** (from Bo Peng), released on [this repo](https://github.com/BlinkDL/RWKV-LM) by Bo Peng. 1. **[SegFormer](model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo. 1. **[Segment Anything](model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick. 1. **[SEW](model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. @@ -396,6 +397,7 @@ Flax), PyTorch, and/or TensorFlow. | RoBERTa-PreLayerNorm | ❌ | ❌ | ✅ | ✅ | ✅ | | RoCBert | ✅ | ❌ | ✅ | ❌ | ❌ | | RoFormer | ✅ | ✅ | ✅ | ✅ | ✅ | +| RWKV | ❌ | ❌ | ✅ | ❌ | ❌ | | SAM | ❌ | ❌ | ✅ | ❌ | ❌ | | SegFormer | ❌ | ❌ | ✅ | ✅ | ❌ | | SEW | ❌ | ❌ | ✅ | ❌ | ❌ |
diff --git a/docs/source/en/model_doc/rwkv.mdx b/docs/source/en/model_doc/rwkv.mdx new file mode 100644 index 000000000000..b8812516560f --- /dev/null +++ b/docs/source/en/model_doc/rwkv.mdx @@ -0,0 +1,129 @@ + + +# RWKV + +## Overview
+ +The RWKV model was proposed in [this repo](https://github.com/BlinkDL/RWKV-LM).
+ +It suggests a tweak in the traditional Transformer attention to make it linear. This way, the model can be used as a recurrent network: passing inputs for timestamp 0 and timestamp 1 together is the same as passing inputs at timestamp 0, then inputs at timestamp 1 along with the state of timestamp 0 (see example below).
+ +This can be more efficient than a regular Transformer and can deal with sentences of any length (even if the model uses a fixed context length for training).
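+
+The model also works with the standard text generation API. The snippet below is only a minimal sketch: it reuses the checkpoint name from the RNN example further down, and any RWKV checkpoint with a matching tokenizer should behave the same way.
+
+```py
+from transformers import AutoTokenizer, RwkvForCausalLM
+
+model = RwkvForCausalLM.from_pretrained("sgugger/rwkv-430M-pile")
+tokenizer = AutoTokenizer.from_pretrained("sgugger/rwkv-430M-pile")
+
+inputs = tokenizer("In a shocking finding, scientists discovered", return_tensors="pt")
+# The recurrent state is carried between decoding steps internally, so no extra arguments are needed.
+output_ids = model.generate(inputs["input_ids"], max_new_tokens=20)
+print(tokenizer.decode(output_ids[0]))
+```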
+ +This model was contributed by [sgugger](https://huggingface.co/sgugger). +The original code can be found [here](https://github.com/BlinkDL/RWKV-LM).
+ +Example of use as an RNN:
+ +```py +import torch +from transformers import AutoTokenizer, RwkvConfig, RwkvModel
+ +model = RwkvModel.from_pretrained("sgugger/rwkv-430M-pile") +tokenizer = AutoTokenizer.from_pretrained("sgugger/rwkv-430M-pile")
+ +inputs = tokenizer("This is an example.", return_tensors="pt") +# Feed everything to the model +outputs = model(inputs["input_ids"]) +output_whole = outputs.last_hidden_state
+ +outputs = model(inputs["input_ids"][:, :2]) +output_one = outputs.last_hidden_state
+ +# Using the state computed on the first inputs, we will get the same output +outputs = model(inputs["input_ids"][:, 2:], state=outputs.state) +output_two = outputs.last_hidden_state
+ +torch.allclose(torch.cat([output_one, output_two], dim=1), output_whole, atol=1e-5) +```
+ +## RwkvConfig
+ +[[autodoc]] RwkvConfig
+ + +## RwkvModel
+ +[[autodoc]] RwkvModel + - forward
+ +## RwkvForCausalLM
+ +[[autodoc]] RwkvForCausalLM + - forward
+ +## Rwkv attention and the recurrent formulas
+ +In a traditional auto-regressive Transformer, attention is written as
+ +$$O = \hbox{softmax}(QK^{T} / \sqrt{d}) V$$
+ +where \\(Q\\), \\(K\\) and \\(V\\) are matrices of shape `seq_len x hidden_size` named query, key and value (they are actually bigger matrices with a batch dimension and an attention head dimension but we're only interested in the last two, which is where the matrix product is taken, so for the sake of simplicity we only consider those two). The product \\(QK^{T}\\) then has shape `seq_len x seq_len` and we can take the matrix product with \\(V\\) to get the output \\(O\\) of the same shape as the others.
+ +Replacing the softmax by its value gives:
+ +$$O_{i} = \frac{\sum_{j=1}^{i} e^{Q_{i} K_{j}^{T} / \sqrt{d}} V_{j}}{\sum_{j=1}^{i} e^{Q_{i} K_{j}^{T} / \sqrt{d}}}$$
+ +Note that the entries in \\(QK^{T}\\) corresponding to \\(j > i\\) are masked (the sum stops at j) because the attention is not allowed to look at future tokens (only past ones).
+ +In comparison, the RWKV attention is given by
+ +$$O_{i} = \sigma(R_{i}) \frac{\sum_{j=1}^{i} e^{W_{i-j} + K_{j}} V_{j}}{\sum_{j=1}^{i} e^{W_{i-j} + K_{j}}}$$
+ +where \\(R\\) is a new matrix called receptance by the author, \\(K\\) and \\(V\\) are still the key and value (\\(\sigma\\) here is the sigmoid function). \\(W\\) is a new vector that represents the position of the token and is given by
+ +$$W_{0} = u \hbox{ and } W_{k} = (k-1)w \hbox{ for } k \geq 1$$
+ +with \\(u\\) and \\(w\\) learnable parameters called in the code `time_first` and `time_decay` respectively. The numerator and denominator can both be expressed recursively.
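+
+Before deriving that recurrence, here is a small NumPy sketch of the naive formula for \\(O_{i}\\) above. It is a hypothetical helper written only to make the indices concrete; the actual implementation never materializes the double sum and adds numerical stabilization, as derived below.
+
+```py
+import numpy as np
+
+
+def naive_rwkv_attention(r, k, v, w, u):
+    # r, k, v have shape (seq_len, hidden_size); w (`time_decay`) and u (`time_first`) have shape (hidden_size,)
+    seq_len = k.shape[0]
+    output = np.zeros_like(v)
+    for i in range(seq_len):
+        # W_{i-j} is u when j == i, and (i - j - 1) * w when j < i
+        position_bias = np.stack([u if j == i else (i - j - 1) * w for j in range(i + 1)])
+        scores = np.exp(position_bias + k[: i + 1])
+        numerator = (scores * v[: i + 1]).sum(axis=0)
+        denominator = scores.sum(axis=0)
+        # O_i = sigmoid(R_i) * N_i / D_i; unlike softmax attention, everything is element-wise per channel
+        output[i] = 1.0 / (1.0 + np.exp(-r[i])) * numerator / denominator
+    return output
+```
+
+Computed this way, the attention costs quadratic time and the exponentials overflow easily, which is why the real kernels instead keep a running state for the numerator and denominator.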
Naming them \\(N_{i}\\) and \\(D_{i}\\) we have: + +$$N_{i} = e^{u + K_{i}} V_{i} + \hat{N}_{i} \hbox{ where } \hat{N}_{i} = e^{K_{i-1}} V_{i-1} + e^{w + K_{i-2}} V_{i-2} \cdots + e^{(i-2)w + K_{1}} V_{1}$$ + +so \\(\hat{N}_{i}\\) (called `numerator_state` in the code) satistfies + +$$\hat{N}_{0} = 0 \hbox{ and } \hat{N}_{j+1} = e^{K_{j}} V_{j} + e^{w} \hat{N}_{j}$$ + +and + +$$D_{i} = e^{u + K_{i}} + \hat{D}_{i} \hbox{ where } \hat{D}_{i} = e^{K_{i-1}} + e^{w + K_{i-2}} \cdots + e^{(i-2)w + K_{1}}$$ + +so \\(\hat{D}_{i}\\) (called `denominator_state` in the code) satistfies + +$$\hat{D}_{0} = 0 \hbox{ and } \hat{D}_{j+1} = e^{K_{j}} + e^{w} \hat{D}_{j}$$ + +The actual recurrent formula used are a tiny bit more complex, as for numerical stability we don't want to compute exponentials of big numbers. Usually the softmax is not computed as is, but the exponential of the maximum term is divided of the numerator and denominator: + +$$\frac{e^{x_{i}}}{\sum_{j=1}^{n} e^{x_{j}}} = \frac{e^{x_{i} - M}}{\sum_{j=1}^{n} e^{x_{j} - M}}$$ + +with \\(M\\) the maximum of all \\(x_{j}\\). So here on top of saving the numerator state (\\(\hat{N}\\)) and the denominator state (\\(\hat{D}\\)) we also keep track of the maximum of all terms encountered in the exponentials. So we actually use + +$$\tilde{N}_{i} = e^{-M_{i}} \hat{N}_{i} \hbox{ and } \tilde{D}_{i} = e^{-M_{i}} \hat{D}_{i}$$ + +defined by the following recurrent formulas: + +$$\tilde{N}_{0} = 0 \hbox{ and } \tilde{N}_{j+1} = e^{K_{j} - q} V_{j} + e^{w + M_{j} - q} \tilde{N}_{j} \hbox{ where } q = \max(K_{j}, w + M_{j})$$ + +and + +$$\tilde{D}_{0} = 0 \hbox{ and } \tilde{D}_{j+1} = e^{K_{j} - q} + e^{w + M_{j} - q} \tilde{D}_{j} \hbox{ where } q = \max(K_{j}, w + M_{j})$$ + +and \\(M_{j+1} = q\\). With those, we can then compute + +$$N_{i} = e^{u + K_{i} - q} V_{i} + e^{M_{i}} \tilde{N}_{i} \hbox{ where } q = \max(u + K_{i}, M_{i})$$ + +and + +$$D_{i} = e^{u + K_{i} - q} + e^{M_{i}} \tilde{D}_{i} \hbox{ where } q = \max(u + K_{i}, M_{i})$$ + +which finally gives us + +$$O_{i} = \sigma(R_{i}) \frac{N_{i}}{D_{i}}$$ \ No newline at end of file diff --git a/docs/source/en/tasks/language_modeling.mdx b/docs/source/en/tasks/language_modeling.mdx index 33bd8c5a1ab6..ea25e17efa47 100644 --- a/docs/source/en/tasks/language_modeling.mdx +++ b/docs/source/en/tasks/language_modeling.mdx @@ -33,8 +33,8 @@ You can finetune other architectures for causal language modeling following the Choose one of the following architectures: +[BART](../model_doc/bart), [BERT](../model_doc/bert), [Bert Generation](../model_doc/bert-generation), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BioGpt](../model_doc/biogpt), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CodeGen](../model_doc/codegen), [CPM-Ant](../model_doc/cpmant), [CTRL](../model_doc/ctrl), [Data2VecText](../model_doc/data2vec-text), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [GIT](../model_doc/git), [GPT-Sw3](../model_doc/gpt-sw3), [OpenAI GPT-2](../model_doc/gpt2), [GPTBigCode](../model_doc/gpt_bigcode), [GPT Neo](../model_doc/gpt_neo), [GPT NeoX](../model_doc/gpt_neox), [GPT NeoX Japanese](../model_doc/gpt_neox_japanese), [GPT-J](../model_doc/gptj), [LLaMA](../model_doc/llama), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MVP](../model_doc/mvp), 
[OpenLlama](../model_doc/open-llama), [OpenAI GPT](../model_doc/openai-gpt), [OPT](../model_doc/opt), [Pegasus](../model_doc/pegasus), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [RWKV](../model_doc/rwkv), [Speech2Text2](../model_doc/speech_to_text_2), [Transformer-XL](../model_doc/transfo-xl), [TrOCR](../model_doc/trocr), [XGLM](../model_doc/xglm), [XLM](../model_doc/xlm), [XLM-ProphetNet](../model_doc/xlm-prophetnet), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod) -[BART](../model_doc/bart), [BERT](../model_doc/bert), [Bert Generation](../model_doc/bert-generation), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BioGpt](../model_doc/biogpt), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CodeGen](../model_doc/codegen), [CPM-Ant](../model_doc/cpmant), [CTRL](../model_doc/ctrl), [Data2VecText](../model_doc/data2vec-text), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [GIT](../model_doc/git), [GPT-Sw3](../model_doc/gpt-sw3), [OpenAI GPT-2](../model_doc/gpt2), [GPTBigCode](../model_doc/gpt_bigcode), [GPT Neo](../model_doc/gpt_neo), [GPT NeoX](../model_doc/gpt_neox), [GPT NeoX Japanese](../model_doc/gpt_neox_japanese), [GPT-J](../model_doc/gptj), [LLaMA](../model_doc/llama), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MVP](../model_doc/mvp), [OpenLlama](../model_doc/open-llama), [OpenAI GPT](../model_doc/openai-gpt), [OPT](../model_doc/opt), [Pegasus](../model_doc/pegasus), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [Speech2Text2](../model_doc/speech_to_text_2), [Transformer-XL](../model_doc/transfo-xl), [TrOCR](../model_doc/trocr), [XGLM](../model_doc/xglm), [XLM](../model_doc/xlm), [XLM-ProphetNet](../model_doc/xlm-prophetnet), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index b0766b0946cd..375966131e0d 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -431,6 +431,7 @@ "models.roberta_prelayernorm": ["ROBERTA_PRELAYERNORM_PRETRAINED_CONFIG_ARCHIVE_MAP", "RobertaPreLayerNormConfig"], "models.roc_bert": ["ROC_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "RoCBertConfig", "RoCBertTokenizer"], "models.roformer": ["ROFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "RoFormerConfig", "RoFormerTokenizer"], + "models.rwkv": ["RWKV_PRETRAINED_CONFIG_ARCHIVE_MAP", "RwkvConfig"], "models.sam": [ "SAM_PRETRAINED_CONFIG_ARCHIVE_MAP", "SamConfig", @@ -2364,6 +2365,14 @@ "load_tf_weights_in_roformer", ] ) + _import_structure["models.rwkv"].extend( + [ + "RWKV_PRETRAINED_MODEL_ARCHIVE_LIST", + "RwkvForCausalLM", + 
"RwkvModel", + "RwkvPreTrainedModel", + ] + ) _import_structure["models.sam"].extend( [ "SAM_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -4169,6 +4178,7 @@ ) from .models.roc_bert import ROC_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, RoCBertConfig, RoCBertTokenizer from .models.roformer import ROFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, RoFormerConfig, RoFormerTokenizer + from .models.rwkv import RWKV_PRETRAINED_CONFIG_ARCHIVE_MAP, RwkvConfig from .models.sam import ( SAM_PRETRAINED_CONFIG_ARCHIVE_MAP, SamConfig, @@ -5783,6 +5793,12 @@ RoFormerPreTrainedModel, load_tf_weights_in_roformer, ) + from .models.rwkv import ( + RWKV_PRETRAINED_MODEL_ARCHIVE_LIST, + RwkvForCausalLM, + RwkvModel, + RwkvPreTrainedModel, + ) from .models.sam import ( SAM_PRETRAINED_MODEL_ARCHIVE_LIST, SamModel, diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 8c8a67fa5cb0..9f6b072a9a0c 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -753,6 +753,8 @@ def _update_model_kwargs_for_generation( model_kwargs["past_key_values"] = self._extract_past_from_model_output( outputs, standardize_cache_format=standardize_cache_format ) + if getattr(outputs, "state", None) is not None: + model_kwargs["state"] = outputs.state # update token_type_ids with last value if "token_type_ids" in model_kwargs: diff --git a/src/transformers/kernels/rwkv/wkv_cuda.cu b/src/transformers/kernels/rwkv/wkv_cuda.cu new file mode 100644 index 000000000000..571d5a8a8307 --- /dev/null +++ b/src/transformers/kernels/rwkv/wkv_cuda.cu @@ -0,0 +1,187 @@ +#include +#include + +#define MIN_VALUE (-1e38) + +template +__global__ void kernel_forward( + const int B, const int T, const int C, const F *__restrict__ const _w, const F *__restrict__ const _u, + const F *__restrict__ const _k, const F *__restrict__ const _v, F *__restrict__ const _y +) { + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + const int _b = idx / C; + const int _c = idx % C; + const int _offset = _b * T * C + _c; + + F u = _u[_c]; + F w = _w[_c]; + const F *__restrict__ const k = _k + _offset; + const F *__restrict__ const v = _v + _offset; + F *__restrict__ const y = _y + _offset; + + // aa and bb are running sums divided by exp(pp) (to avoid overflow) + F aa = 0, bb = 0, pp = MIN_VALUE; + for (int i = 0; i < T; i++) { + const int ii = i * C; + const F kk = k[ii]; + const F vv = v[ii]; + + F ww = u + kk; + F p = max(pp, ww); + F e1 = exp(pp - p); + F e2 = exp(ww - p); + y[ii] = (e1 * aa + e2 * vv) / (e1 * bb + e2); + + ww = w + pp; + p = max(ww, kk); + e1 = exp(ww - p); + e2 = exp(kk - p); + aa = e1 * aa + e2 * vv; + bb = e1 * bb + e2; + pp = p; + } +} + +template +__global__ void kernel_forward_with_state( + const int B, const int T, const int C, const F *__restrict__ const _w, const F *__restrict__ const _u, + const F *__restrict__ const _k, const F *__restrict__ const _v, F *__restrict__ const _y, F *__restrict__ const _s +) { + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + const int _b = idx / C; + const int _c = idx % C; + const int _offset_s = _b * C * 3 + _c * 3; + const int _offset = _b * T * C + _c; + + F u = _u[_c]; + F w = _w[_c]; + const F *__restrict__ const k = _k + _offset; + const F *__restrict__ const v = _v + _offset; + F *__restrict__ const y = _y + _offset; + F *__restrict__ const s = _s + _offset_s; + + // aa and bb are running sums divided by exp(pp) (to avoid overflow) + F aa = s[0], bb = s[1], pp = s[2]; + for (int i = 0; i < T; i++) { + const int ii = i * C; + const F kk = 
k[ii]; + const F vv = v[ii]; + + F ww = u + kk; + F p = max(pp, ww); + F e1 = exp(pp - p); + F e2 = exp(ww - p); + y[ii] = (e1 * aa + e2 * vv) / (e1 * bb + e2); + + ww = w + pp; + p = max(ww, kk); + e1 = exp(ww - p); + e2 = exp(kk - p); + aa = e1 * aa + e2 * vv; + bb = e1 * bb + e2; + pp = p; + } + s[0] = aa; + s[1] = bb; + s[2] = pp; +} + +template +__global__ void kernel_backward( + const int B, const int T, const int C, const F *__restrict__ const _w, const F *__restrict__ const _u, + const F *__restrict__ const _k, const F *__restrict__ const _v, const F *__restrict__ const _y, + const F *__restrict__ const _gy, F *__restrict__ const _gw, F *__restrict__ const _gu, F *__restrict__ const _gk, + F *__restrict__ const _gv +) { + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + const int _b = idx / C; + const int _c = idx % C; + const int _offset = _b * T * C + _c; + + F u = _u[_c]; + F w = _w[_c]; + const F *__restrict__ const k = _k + _offset; + const F *__restrict__ const v = _v + _offset; + const F *__restrict__ const y = _y + _offset; + const F *__restrict__ const gy = _gy + _offset; + F *__restrict__ const gk = _gk + _offset; + F *__restrict__ const gv = _gv + _offset; + + F q[Tmax], r[Tmax]; + + F gw = 0, gu = 0, aa = 0, bb = 0, ga = 0, gb = 0, pp = MIN_VALUE; + for (int i = 0; i < T; i++) { + const int ii = i * C; + const F kk = k[ii]; + const F vv = v[ii]; + const F yy = y[ii]; + + F ww = u + kk; + F p = max(pp, ww); + F e1 = exp(pp - p); + F e2 = exp(ww - p); + const F qq = gy[ii] / (e1 * bb + e2); + gw += (ga - gb * yy) * e1 * qq; + gu += (vv - yy) * e2 * qq; + q[i] = qq; + r[i] = ww - p; + + ww = w + pp; + p = max(ww, kk); + e1 = exp(ww - p); + e2 = exp(kk - p); + ga = e1 * (aa + ga); + gb = e1 * (bb + gb); + aa = e1 * aa + e2 * vv; + bb = e1 * bb + e2; + pp = p; + } + const int _offsetBC = _b * C + _c; + _gw[_offsetBC] = gw * _w[_c]; // multiply by w because of w -> -exp(w) in python forward() + _gu[_offsetBC] = gu; + + aa = 0, bb = 0, pp = MIN_VALUE; + for (int i = T - 1; i >= 0; i--) { + const int ii = i * C; + const F kk = k[ii]; + const F vv = v[ii]; + const F yy = y[ii]; + const F qq = q[i]; + const F rr = r[i]; + + F e1 = qq * exp(rr); + F e2 = exp(kk + pp); + gk[ii] = e1 * (vv - yy) + e2 * (aa * vv + bb); + gv[ii] = e1 + e2 * aa; + + const F ww = w + pp; + const F www = rr - u - kk; + const F p = max(ww, www); + e1 = exp(ww - p); + e2 = qq * exp(www - p); + aa = e1 * aa + e2; + bb = e1 * bb - e2 * yy; + pp = p; + } +} + +void cuda_forward(int B, int T, int C, float *w, float *u, float *k, float *v, float *y) { + dim3 threadsPerBlock( min(C, 32) ); // requires --maxrregcount 60 for optimal performance + assert(B * C % threadsPerBlock.x == 0); + dim3 numBlocks(B * C / threadsPerBlock.x); + kernel_forward<<>>(B, T, C, w, u, k, v, y); +} + +void cuda_forward_with_state(int B, int T, int C, float *w, float *u, float *k, float *v, float *y, float *s) { + dim3 threadsPerBlock( min(C, 32) ); // requires --maxrregcount 60 for optimal performance + assert(B * C % threadsPerBlock.x == 0); + dim3 numBlocks(B * C / threadsPerBlock.x); + kernel_forward_with_state<<>>(B, T, C, w, u, k, v, y, s); +} + +void cuda_backward(int B, int T, int C, float *w, float *u, float *k, float *v, float *y, float *gy, float *gw, float *gu, float *gk, float *gv) { + dim3 threadsPerBlock( min(C, 32) ); // requires --maxrregcount 60 for optimal performance + assert(B * C % threadsPerBlock.x == 0); + dim3 numBlocks(B * C / threadsPerBlock.x); + kernel_backward<<>>(B, T, C, w, u, k, v, y, gy, gw, 
gu, gk, gv); +} diff --git a/src/transformers/kernels/rwkv/wkv_cuda_bf16.cu b/src/transformers/kernels/rwkv/wkv_cuda_bf16.cu new file mode 100644 index 000000000000..042cb4aba1db --- /dev/null +++ b/src/transformers/kernels/rwkv/wkv_cuda_bf16.cu @@ -0,0 +1,186 @@ +#include +#include +#include "ATen/ATen.h" +#define MIN_VALUE (-1e38) +typedef at::BFloat16 bf16; + +__global__ void kernel_forward_bf16( + const int B, const int T, const int C, const float *__restrict__ const _w, const bf16 *__restrict__ const _u, + const bf16 *__restrict__ const _k, const bf16 *__restrict__ const _v, bf16 *__restrict__ const _y +) { + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + const int _b = idx / C; + const int _c = idx % C; + const int _offset = _b * T * C + _c; + + float u = float(_u[_c]); + float w = _w[_c]; + const bf16 *__restrict__ const k = _k + _offset; + const bf16 *__restrict__ const v = _v + _offset; + bf16 *__restrict__ const y = _y + _offset; + + // aa and bb are running sums divided by exp(pp) (to avoid overflow) + float aa = 0, bb = 0, pp = MIN_VALUE; + for (int i = 0; i < T; i++) { + const int ii = i * C; + const float kk = float(k[ii]); + const float vv = float(v[ii]); + + float ww = u + kk; + float p = max(pp, ww); + float e1 = exp(pp - p); + float e2 = exp(ww - p); + y[ii] = bf16((e1 * aa + e2 * vv) / (e1 * bb + e2)); + + ww = w + pp; + p = max(ww, kk); + e1 = exp(ww - p); + e2 = exp(kk - p); + aa = e1 * aa + e2 * vv; + bb = e1 * bb + e2; + pp = p; + } +} + +__global__ void kernel_forward_with_state_bf16( + const int B, const int T, const int C, const float *__restrict__ const _w, const bf16 *__restrict__ const _u, + const bf16 *__restrict__ const _k, const bf16 *__restrict__ const _v, bf16 *__restrict__ const _y, + float *__restrict__ const _s +) { + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + const int _b = idx / C; + const int _c = idx % C; + const int _offset_s = _b * C * 3 + _c * 3; + const int _offset = _b * T * C + _c; + + float u = float(_u[_c]); + float w = _w[_c]; + const bf16 *__restrict__ const k = _k + _offset; + const bf16 *__restrict__ const v = _v + _offset; + bf16 *__restrict__ const y = _y + _offset; + float *__restrict__ const s = _s + _offset_s; + + // aa and bb are running sums divided by exp(pp) (to avoid overflow) + float aa = s[0], bb = s[1], pp = s[2]; + for (int i = 0; i < T; i++) { + const int ii = i * C; + const float kk = float(k[ii]); + const float vv = float(v[ii]); + + float ww = u + kk; + float p = max(pp, ww); + float e1 = exp(pp - p); + float e2 = exp(ww - p); + y[ii] = bf16(e1 * aa + e2 * vv) / (e1 * bb + e2); + + ww = w + pp; + p = max(ww, kk); + e1 = exp(ww - p); + e2 = exp(kk - p); + aa = e1 * aa + e2 * vv; + bb = e1 * bb + e2; + pp = p; + } + s[0] = aa; + s[1] = bb; + s[2] = pp; +} + +__global__ void kernel_backward_bf16( + const int B, const int T, const int C, const float *__restrict__ const _w, const bf16 *__restrict__ const _u, + const bf16 *__restrict__ const _k, const bf16 *__restrict__ const _v, const bf16 *__restrict__ const _y, + const bf16 *__restrict__ const _gy, bf16 *__restrict__ const _gw, bf16 *__restrict__ const _gu, + bf16 *__restrict__ const _gk, bf16 *__restrict__ const _gv +) { + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + const int _b = idx / C; + const int _c = idx % C; + const int _offset = _b * T * C + _c; + + float u = float(_u[_c]); + float w = _w[_c]; + const bf16 *__restrict__ const k = _k + _offset; + const bf16 *__restrict__ const v = _v + _offset; + const bf16 *__restrict__ const y 
= _y + _offset; + const bf16 *__restrict__ const gy = _gy + _offset; + bf16 *__restrict__ const gk = _gk + _offset; + bf16 *__restrict__ const gv = _gv + _offset; + + float q[Tmax], r[Tmax]; + + float gw = 0, gu = 0, aa = 0, bb = 0, ga = 0, gb = 0, pp = MIN_VALUE; + for (int i = 0; i < T; i++) { + const int ii = i * C; + const float kk = float(k[ii]); + const float vv = float(v[ii]); + const float yy = float(y[ii]); + + float ww = u + kk; + float p = max(pp, ww); + float e1 = exp(pp - p); + float e2 = exp(ww - p); + const float qq = float(gy[ii]) / (e1 * bb + e2); + gw += (ga - gb * yy) * e1 * qq; + gu += (vv - yy) * e2 * qq; + q[i] = qq; + r[i] = ww - p; + + ww = w + pp; + p = max(ww, kk); + e1 = exp(ww - p); + e2 = exp(kk - p); + ga = e1 * (aa + ga); + gb = e1 * (bb + gb); + aa = e1 * aa + e2 * vv; + bb = e1 * bb + e2; + pp = p; + } + const int _offsetBC = _b * C + _c; + _gw[_offsetBC] = bf16(gw * _w[_c]); // multiply by w because of w -> -exp(w) in python forward() + _gu[_offsetBC] = bf16(gu); + + aa = 0, bb = 0, pp = MIN_VALUE; + for (int i = T - 1; i >= 0; i--) { + const int ii = i * C; + const float kk = float(k[ii]); + const float vv = float(v[ii]); + const float yy = float(y[ii]); + const float qq = q[i]; + const float rr = r[i]; + + float e1 = qq * exp(rr); + float e2 = exp(kk + pp); + gk[ii] = bf16(e1 * (vv - yy) + e2 * (aa * vv + bb)); + gv[ii] = bf16(e1 + e2 * aa); + + const float ww = w + pp; + const float www = rr - u - kk; + const float p = max(ww, www); + e1 = exp(ww - p); + e2 = qq * exp(www - p); + aa = e1 * aa + e2; + bb = e1 * bb - e2 * yy; + pp = p; + } +} + +void cuda_forward_bf16(int B, int T, int C, float *w, bf16 *u, bf16 *k, bf16 *v, bf16 *y) { + dim3 threadsPerBlock( min(C, 32) ); // requires --maxrregcount 60 for optimal performance + assert(B * C % threadsPerBlock.x == 0); + dim3 numBlocks(B * C / threadsPerBlock.x); + kernel_forward_bf16<<>>(B, T, C, w, u, k, v, y); +} + +void cuda_forward_with_state_bf16(int B, int T, int C, float *w, bf16 *u, bf16 *k, bf16 *v, bf16 *y, float *s) { + dim3 threadsPerBlock( min(C, 32) ); // requires --maxrregcount 60 for optimal performance + assert(B * C % threadsPerBlock.x == 0); + dim3 numBlocks(B * C / threadsPerBlock.x); + kernel_forward_with_state_bf16<<>>(B, T, C, w, u, k, v, y, s); +} + +void cuda_backward_bf16(int B, int T, int C, float *w, bf16 *u, bf16 *k, bf16 *v, bf16 *y, bf16 *gy, bf16 *gw, bf16 *gu, bf16 *gk, bf16 *gv) { + dim3 threadsPerBlock( min(C, 32) ); // requires --maxrregcount 60 for optimal performance + assert(B * C % threadsPerBlock.x == 0); + dim3 numBlocks(B * C / threadsPerBlock.x); + kernel_backward_bf16<<>>(B, T, C, w, u, k, v, y, gy, gw, gu, gk, gv); +} diff --git a/src/transformers/kernels/rwkv/wkv_op.cpp b/src/transformers/kernels/rwkv/wkv_op.cpp new file mode 100644 index 000000000000..55e728066592 --- /dev/null +++ b/src/transformers/kernels/rwkv/wkv_op.cpp @@ -0,0 +1,66 @@ +#include +#include "ATen/ATen.h" +typedef at::BFloat16 bf16; + +void cuda_forward(int B, int T, int C, float *w, float *u, float *k, float *v, float *y); +void cuda_forward_bf16(int B, int T, int C, float *w, bf16 *u, bf16 *k, bf16 *v, bf16 *y); +void cuda_forward_with_state(int B, int T, int C, float *w, float *u, float *k, float *v, float *y, float *s); +void cuda_forward_with_state_bf16(int B, int T, int C, float *w, bf16 *u, bf16 *k, bf16 *v, bf16 *y, float *s); +void cuda_backward(int B, int T, int C, float *w, float *u, float *k, float *v, float *y, float *gy, float *gw, float *gu, float *gk, float *gv); +void 
cuda_backward_bf16(int B, int T, int C, float *w, bf16 *u, bf16 *k, bf16 *v, bf16 *y, bf16 *gy, bf16 *gw, bf16 *gu, bf16 *gk, bf16 *gv); + +void forward(torch::Tensor &w, torch::Tensor &u, torch::Tensor &k, torch::Tensor &v, torch::Tensor &y) { + const int B = k.size(0); + const int T = k.size(1); + const int C = k.size(2); + cuda_forward(B, T, C, w.data_ptr(), u.data_ptr(), k.data_ptr(), v.data_ptr(), y.data_ptr()); +} +void forward_bf16(torch::Tensor &w, torch::Tensor &u, torch::Tensor &k, torch::Tensor &v, torch::Tensor &y) { + const int B = k.size(0); + const int T = k.size(1); + const int C = k.size(2); + cuda_forward_bf16(B, T, C, w.data_ptr(), u.data_ptr(), k.data_ptr(), v.data_ptr(), y.data_ptr()); +} +void forward_with_state(torch::Tensor &w, torch::Tensor &u, torch::Tensor &k, torch::Tensor &v, torch::Tensor &y, torch::Tensor &s) { + const int B = k.size(0); + const int T = k.size(1); + const int C = k.size(2); + cuda_forward_with_state(B, T, C, w.data_ptr(), u.data_ptr(), k.data_ptr(), v.data_ptr(), y.data_ptr(), s.data_ptr()); +} +void forward_with_state_bf16(torch::Tensor &w, torch::Tensor &u, torch::Tensor &k, torch::Tensor &v, torch::Tensor &y, torch::Tensor &s) { + const int B = k.size(0); + const int T = k.size(1); + const int C = k.size(2); + cuda_forward_with_state_bf16(B, T, C, w.data_ptr(), u.data_ptr(), k.data_ptr(), v.data_ptr(), y.data_ptr(), s.data_ptr()); +} +void backward(torch::Tensor &w, torch::Tensor &u, torch::Tensor &k, torch::Tensor &v, torch::Tensor &y, torch::Tensor &gy, torch::Tensor &gw, torch::Tensor &gu, torch::Tensor &gk, torch::Tensor &gv) { + const int B = k.size(0); + const int T = k.size(1); + const int C = k.size(2); + cuda_backward(B, T, C, w.data_ptr(), u.data_ptr(), k.data_ptr(), v.data_ptr(), y.data_ptr(), gy.data_ptr(), gw.data_ptr(), gu.data_ptr(), gk.data_ptr(), gv.data_ptr()); +} +void backward_bf16(torch::Tensor &w, torch::Tensor &u, torch::Tensor &k, torch::Tensor &v, torch::Tensor &y, torch::Tensor &gy, torch::Tensor &gw, torch::Tensor &gu, torch::Tensor &gk, torch::Tensor &gv) { + const int B = k.size(0); + const int T = k.size(1); + const int C = k.size(2); + cuda_backward_bf16(B, T, C, w.data_ptr(), u.data_ptr(), k.data_ptr(), v.data_ptr(), y.data_ptr(), + gy.data_ptr(), gw.data_ptr(), gu.data_ptr(), gk.data_ptr(), gv.data_ptr()); +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("forward", &forward, "wkv forward"); + m.def("forward_bf16", &forward_bf16, "wkv forward bf16"); + m.def("forward_with_state", &forward_with_state, "wkv forward with state"); + m.def("forward_with_state_bf16", &forward_with_state_bf16, "wkv forward with state bf16"); + m.def("backward", &backward, "wkv backward"); + m.def("backward_bf16", &backward_bf16, "wkv backward bf16"); +} + +TORCH_LIBRARY(wkv, m) { + m.def("forward", forward); + m.def("forward_bf16", forward_bf16); + m.def("forward_with_state", forward_with_state); + m.def("forward_with_state_bf16", forward_with_state_bf16); + m.def("backward", backward); + m.def("backward_bf16", backward_bf16); +} diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 4753f253df74..9c955a008f30 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -162,6 +162,7 @@ roberta_prelayernorm, roc_bert, roformer, + rwkv, sam, segformer, sew, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index c3623437348a..8da4a15b29ee 100755 --- 
a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -163,6 +163,7 @@ ("roberta-prelayernorm", "RobertaPreLayerNormConfig"), ("roc_bert", "RoCBertConfig"), ("roformer", "RoFormerConfig"), + ("rwkv", "RwkvConfig"), ("sam", "SamConfig"), ("segformer", "SegformerConfig"), ("sew", "SEWConfig"), @@ -343,6 +344,7 @@ ("roberta-prelayernorm", "ROBERTA_PRELAYERNORM_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("roc_bert", "ROC_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("roformer", "ROFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("rwkv", "RWKV_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("sam", "SAM_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("segformer", "SEGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("sew", "SEW_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -545,6 +547,7 @@ ("roberta-prelayernorm", "RoBERTa-PreLayerNorm"), ("roc_bert", "RoCBert"), ("roformer", "RoFormer"), + ("rwkv", "RWKV"), ("sam", "SAM"), ("segformer", "SegFormer"), ("sew", "SEW"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 1ebc906d2d67..8e1835e31f09 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -158,6 +158,7 @@ ("roberta-prelayernorm", "RobertaPreLayerNormModel"), ("roc_bert", "RoCBertModel"), ("roformer", "RoFormerModel"), + ("rwkv", "RwkvModel"), ("sam", "SamModel"), ("segformer", "SegformerModel"), ("sew", "SEWModel"), @@ -248,6 +249,7 @@ ("roberta", "RobertaForMaskedLM"), ("roberta-prelayernorm", "RobertaPreLayerNormForMaskedLM"), ("roc_bert", "RoCBertForPreTraining"), + ("rwkv", "RwkvForCausalLM"), ("splinter", "SplinterForPreTraining"), ("squeezebert", "SqueezeBertForMaskedLM"), ("switch_transformers", "SwitchTransformersForConditionalGeneration"), @@ -332,6 +334,7 @@ ("roberta-prelayernorm", "RobertaPreLayerNormForMaskedLM"), ("roc_bert", "RoCBertForMaskedLM"), ("roformer", "RoFormerForMaskedLM"), + ("rwkv", "RwkvForCausalLM"), ("speech_to_text", "Speech2TextForConditionalGeneration"), ("squeezebert", "SqueezeBertForMaskedLM"), ("switch_transformers", "SwitchTransformersForConditionalGeneration"), @@ -395,6 +398,7 @@ ("roberta-prelayernorm", "RobertaPreLayerNormForCausalLM"), ("roc_bert", "RoCBertForCausalLM"), ("roformer", "RoFormerForCausalLM"), + ("rwkv", "RwkvForCausalLM"), ("speech_to_text_2", "Speech2Text2ForCausalLM"), ("transfo-xl", "TransfoXLLMHeadModel"), ("trocr", "TrOCRForCausalLM"), diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index de954e206ae1..cb6c91521de9 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -297,6 +297,7 @@ ), ("roc_bert", ("RoCBertTokenizer", None)), ("roformer", ("RoFormerTokenizer", "RoFormerTokenizerFast" if is_tokenizers_available() else None)), + ("rwkv", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)), ("speech_to_text", ("Speech2TextTokenizer" if is_sentencepiece_available() else None, None)), ("speech_to_text_2", ("Speech2Text2Tokenizer", None)), ("speecht5", ("SpeechT5Tokenizer" if is_sentencepiece_available() else None, None)), diff --git a/src/transformers/models/rwkv/__init__.py b/src/transformers/models/rwkv/__init__.py new file mode 100644 index 000000000000..e68eefe9f8aa --- /dev/null +++ b/src/transformers/models/rwkv/__init__.py @@ -0,0 +1,60 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import TYPE_CHECKING + +from ...utils import ( + OptionalDependencyNotAvailable, + _LazyModule, + is_torch_available, +) + + +_import_structure = { + "configuration_rwkv": ["RWKV_PRETRAINED_CONFIG_ARCHIVE_MAP", "RwkvConfig", "RwkvOnnxConfig"], +} + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_rwkv"] = [ + "RWKV_PRETRAINED_MODEL_ARCHIVE_LIST", + "RwkvForCausalLM", + "RwkvModel", + "RwkvPreTrainedModel", + ] + + +if TYPE_CHECKING: + from .configuration_rwkv import RWKV_PRETRAINED_CONFIG_ARCHIVE_MAP, RwkvConfig, RwkvOnnxConfig + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_rwkv import ( + RWKV_PRETRAINED_MODEL_ARCHIVE_LIST, + RwkvForCausalLM, + RwkvModel, + RwkvPreTrainedModel, + ) +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/rwkv/configuration_rwkv.py b/src/transformers/models/rwkv/configuration_rwkv.py new file mode 100644 index 000000000000..89b2f5fb6483 --- /dev/null +++ b/src/transformers/models/rwkv/configuration_rwkv.py @@ -0,0 +1,130 @@ +# coding=utf-8 +# Copyright 2023 The OpenAI Team Authors and HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" RWKV configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+RWKV_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "RWKV/rwkv-4-169m-pile": "https://huggingface.co/RWKV/rwkv-4-169m-pile/resolve/main/config.json",
+    "RWKV/rwkv-4-430m-pile": "https://huggingface.co/RWKV/rwkv-4-430m-pile/resolve/main/config.json",
+    "RWKV/rwkv-4-1b5-pile": "https://huggingface.co/RWKV/rwkv-4-1b5-pile/resolve/main/config.json",
+    "RWKV/rwkv-4-3b-pile": "https://huggingface.co/RWKV/rwkv-4-3b-pile/resolve/main/config.json",
+    "RWKV/rwkv-4-7b-pile": "https://huggingface.co/RWKV/rwkv-4-7b-pile/resolve/main/config.json",
+    "RWKV/rwkv-4-14b-pile": "https://huggingface.co/RWKV/rwkv-4-14b-pile/resolve/main/config.json",
+    "RWKV/rwkv-raven-1b5": "https://huggingface.co/RWKV/rwkv-raven-1b5/resolve/main/config.json",
+    "RWKV/rwkv-raven-3b": "https://huggingface.co/RWKV/rwkv-raven-3b/resolve/main/config.json",
+    "RWKV/rwkv-raven-7b": "https://huggingface.co/RWKV/rwkv-raven-7b/resolve/main/config.json",
+    "RWKV/rwkv-raven-14b": "https://huggingface.co/RWKV/rwkv-raven-14b/resolve/main/config.json",
+}
+
+
+class RwkvConfig(PretrainedConfig):
+    """
+    This is the configuration class to store the configuration of a [`RwkvModel`]. It is used to instantiate a RWKV
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with
+    the defaults will yield a similar configuration to that of the RWKV-4
+    [RWKV/rwkv-4-169m-pile](https://huggingface.co/RWKV/rwkv-4-169m-pile) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 50277):
+            Vocabulary size of the RWKV model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`RwkvModel`].
+        context_length (`int`, *optional*, defaults to 1024):
+            The maximum sequence length that this model can be used with in a single forward pass (using it in RNN
+            mode lets you use any sequence length).
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimensionality of the embeddings and hidden states.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the model.
+        attention_hidden_size (`int`, *optional*):
+            Dimensionality of the attention hidden states. Will default to `hidden_size` if unset.
+        intermediate_size (`int`, *optional*):
+            Dimensionality of the inner feed-forward layers. Will default to 4 times `hidden_size` if unset.
+        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
+            The epsilon to use in the layer normalization layers.
+        bos_token_id (`int`, *optional*, defaults to 0):
+            The id of the beginning of sentence token in the vocabulary. Defaults to 0 as RWKV uses the same tokenizer
+            as GPTNeoX.
+        eos_token_id (`int`, *optional*, defaults to 0):
+            The id of the end of sentence token in the vocabulary. Defaults to 0 as RWKV uses the same tokenizer as
+            GPTNeoX.
+        rescale_every (`int`, *optional*, defaults to 6):
+            At inference, the hidden states (and weights of the corresponding output layers) are divided by 2 every
+            `rescale_every` layer. If set to 0 or a negative number, no rescale is done.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether or not to tie the word embeddings with the input token embeddings.
+ use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last state. + + + Example: + + ```python + >>> from transformers import RwkvConfig, RwkvModel + + >>> # Initializing a Rwkv configuration + >>> configuration = RwkvConfig() + + >>> # Initializing a model (with random weights) from the configuration + >>> model = RwkvModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "rwkv" + attribute_map = {"max_position_embeddings": "context_length"} + + def __init__( + self, + vocab_size=50277, + context_length=1024, + hidden_size=4096, + num_hidden_layers=32, + attention_hidden_size=None, + intermediate_size=None, + layer_norm_epsilon=1e-5, + bos_token_id=0, + eos_token_id=0, + rescale_every=6, + tie_word_embeddings=False, + use_cache=True, + **kwargs, + ): + self.vocab_size = vocab_size + self.context_length = context_length + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.attention_hidden_size = attention_hidden_size if attention_hidden_size is not None else hidden_size + self.intermediate_size = intermediate_size if intermediate_size is not None else 4 * hidden_size + self.layer_norm_epsilon = layer_norm_epsilon + self.rescale_every = rescale_every + self.use_cache = use_cache + + self.bos_token_id = bos_token_id + self.eos_token_id = eos_token_id + + super().__init__( + tie_word_embeddings=tie_word_embeddings, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs + ) diff --git a/src/transformers/models/rwkv/convert_rwkv_checkpoint_to_hf.py b/src/transformers/models/rwkv/convert_rwkv_checkpoint_to_hf.py new file mode 100644 index 000000000000..b340b9f028b3 --- /dev/null +++ b/src/transformers/models/rwkv/convert_rwkv_checkpoint_to_hf.py @@ -0,0 +1,201 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Convert a RWKV checkpoint from BlinkDL to the Hugging Face format.""" + + +import argparse +import gc +import json +import os +import re + +import torch +from huggingface_hub import hf_hub_download + +from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerFast, RwkvConfig +from transformers.modeling_utils import WEIGHTS_INDEX_NAME, shard_checkpoint + + +NUM_HIDDEN_LAYERS_MAPPING = { + "169M": 12, + "430M": 24, + "1B5": 24, + "3B": 32, + "7B": 32, + "14B": 40, +} + +HIDEN_SIZE_MAPPING = { + "169M": 768, + "430M": 1024, + "1B5": 2048, + "3B": 2560, + "7B": 4096, + "14B": 5120, +} + + +def convert_state_dict(state_dict): + state_dict_keys = list(state_dict.keys()) + for name in state_dict_keys: + weight = state_dict.pop(name) + # emb -> embedding + if name.startswith("emb."): + name = name.replace("emb.", "embeddings.") + # ln_0 -> pre_ln (only present at block 0) + if name.startswith("blocks.0.ln0"): + name = name.replace("blocks.0.ln0", "blocks.0.pre_ln") + # att -> attention + name = re.sub(r"blocks\.(\d+)\.att", r"blocks.\1.attention", name) + # ffn -> feed_forward + name = re.sub(r"blocks\.(\d+)\.ffn", r"blocks.\1.feed_forward", name) + # time_mix_k -> time_mix_key and reshape + if name.endswith(".time_mix_k"): + name = name.replace(".time_mix_k", ".time_mix_key") + # time_mix_v -> time_mix_value and reshape + if name.endswith(".time_mix_v"): + name = name.replace(".time_mix_v", ".time_mix_value") + # time_mix_r -> time_mix_key and reshape + if name.endswith(".time_mix_r"): + name = name.replace(".time_mix_r", ".time_mix_receptance") + + if name != "head.weight": + name = "rwkv." + name + + state_dict[name] = weight + return state_dict + + +def convert_rmkv_checkpoint_to_hf_format( + repo_id, checkpoint_file, output_dir, size=None, tokenizer_file=None, push_to_hub=False, model_name=None +): + # 1. If possible, build the tokenizer. + if tokenizer_file is None: + print("No `--tokenizer_file` provided, we will use the default tokenizer.") + vocab_size = 50277 + tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b") + else: + tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_file) + vocab_size = len(tokenizer) + tokenizer.save_pretrained(output_dir) + + # 2. Build the config + possible_sizes = list(NUM_HIDDEN_LAYERS_MAPPING.keys()) + if size is None: + # Try to infer size from the checkpoint name + for candidate in possible_sizes: + if candidate in checkpoint_file: + size = candidate + break + if size is None: + raise ValueError("Could not infer the size, please provide it with the `--size` argument.") + if size not in possible_sizes: + raise ValueError(f"`size` should be one of {possible_sizes}, got {size}.") + + config = RwkvConfig( + vocab_size=vocab_size, + num_hidden_layers=NUM_HIDDEN_LAYERS_MAPPING[size], + hidden_size=HIDEN_SIZE_MAPPING[size], + ) + config.save_pretrained(output_dir) + + # 3. Download model file then convert state_dict + model_file = hf_hub_download(repo_id, checkpoint_file) + state_dict = torch.load(model_file, map_location="cpu") + state_dict = convert_state_dict(state_dict) + + # 4. 
Split in shards and save + shards, index = shard_checkpoint(state_dict) + for shard_file, shard in shards.items(): + torch.save(shard, os.path.join(output_dir, shard_file)) + + if index is not None: + save_index_file = os.path.join(output_dir, WEIGHTS_INDEX_NAME) + # Save the index as well + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + # 5. Clean up shards (for some reason the file PyTorch saves take the same space as the whole state_dict + print( + "Cleaning up shards. This may error with an OOM error, it this is the case don't worry you still have converted the model." + ) + shard_files = list(shards.keys()) + + del state_dict + del shards + gc.collect() + + for shard_file in shard_files: + state_dict = torch.load(os.path.join(output_dir, shard_file)) + torch.save({k: v.cpu().clone() for k, v in state_dict.items()}, os.path.join(output_dir, shard_file)) + + del state_dict + gc.collect() + + if push_to_hub: + if model_name is None: + raise ValueError("Please provide a `model_name` to push the model to the Hub.") + model = AutoModelForCausalLM.from_pretrained(output_dir) + model.push_to_hub(model_name, max_shard_size="2GB") + tokenizer.push_to_hub(model_name) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--repo_id", default=None, type=str, required=True, help="Repo ID from which to pull the checkpoint." + ) + parser.add_argument( + "--checkpoint_file", default=None, type=str, required=True, help="Name of the checkpoint file in the repo." + ) + parser.add_argument( + "--output_dir", default=None, type=str, required=True, help="Where to save the converted model." + ) + parser.add_argument( + "--tokenizer_file", + default=None, + type=str, + help="Path to the tokenizer file to use (if not provided, only the model is converted).", + ) + parser.add_argument( + "--size", + default=None, + type=str, + help="Size of the model. Will be inferred from the `checkpoint_file` if not passed.", + ) + parser.add_argument( + "--push_to_hub", + action="store_true", + help="Push to the Hub the converted model.", + ) + parser.add_argument( + "--model_name", + default=None, + type=str, + help="Name of the pushed model on the Hub, including the username / organization.", + ) + + args = parser.parse_args() + convert_rmkv_checkpoint_to_hf_format( + args.repo_id, + args.checkpoint_file, + args.output_dir, + size=args.size, + tokenizer_file=args.tokenizer_file, + push_to_hub=args.push_to_hub, + model_name=args.model_name, + ) diff --git a/src/transformers/models/rwkv/modeling_rwkv.py b/src/transformers/models/rwkv/modeling_rwkv.py new file mode 100644 index 000000000000..dd85c279daeb --- /dev/null +++ b/src/transformers/models/rwkv/modeling_rwkv.py @@ -0,0 +1,804 @@ +# coding=utf-8 +# Copyright 2023 Bo Peng and HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""PyTorch RWKV model.""" + +import math +from dataclasses import dataclass +from pathlib import Path +from typing import List, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import CrossEntropyLoss + +from ...modeling_utils import PreTrainedModel +from ...utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_ninja_available, + is_torch_cuda_available, + logging, +) +from .configuration_rwkv import RwkvConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "RWKV/rwkv-4-169m-pile" +_CONFIG_FOR_DOC = "RwkvConfig" + +RWKV_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "RWKV/rwkv-4-169m-pile", + "RWKV/rwkv-4-430m-pile", + "RWKV/rwkv-4-1b5-pile", + "RWKV/rwkv-4-3b-pile", + "RWKV/rwkv-4-7b-pile", + "RWKV/rwkv-4-14b-pile", + "RWKV/rwkv-raven-1b5", + "RWKV/rwkv-raven-3b", + "RWKV/rwkv-raven-7b", + "RWKV/rwkv-raven-14b", + # See all RWKV models at https://huggingface.co/models?filter=rwkv +] + + +rwkv_cuda_kernel = None + + +def load_wkv_cuda_kernel(context_length): + from torch.utils.cpp_extension import load as load_kernel + + global rwkv_cuda_kernel + + kernel_folder = Path(__file__).resolve().parent.parent.parent / "kernels" / "rwkv" + cuda_kernel_files = [kernel_folder / f for f in ["wkv_op.cpp", "wkv_cuda.cu", "wkv_cuda_bf16.cu"]] + + # Only load the kernel if it's not been loaded yet or if we changed the context length + if rwkv_cuda_kernel is not None and rwkv_cuda_kernel.max_seq_length == context_length: + return + + logger.info(f"Loading CUDA kernel for RWKV at context length of {context_length}.") + + flags = [ + "-res-usage", + "--maxrregcount 60", + "--use_fast_math", + "-O3", + "-Xptxas -O3", + "--extra-device-vectorization", + f"-DTmax={context_length}", + ] + rwkv_cuda_kernel = load_kernel( + name=f"wkv_{context_length}", + sources=cuda_kernel_files, + verbose=(logging.get_verbosity() == logging.DEBUG), + extra_cuda_cflags=flags, + ) + rwkv_cuda_kernel.max_seq_length = context_length + + +class RwkvLinearAttention(torch.autograd.Function): + @staticmethod + def forward(ctx, time_decay, time_first, key, value, state=None, return_state=False): + batch_size, seq_len, hidden_size = key.size() + if seq_len > rwkv_cuda_kernel.max_seq_length: + raise ValueError( + f"Cannot process a batch with {seq_len} tokens at the same time, use a maximum of " + f"{rwkv_cuda_kernel.max_seq_length} with this model." + ) + if batch_size * hidden_size % min(hidden_size, 32) != 0: + raise ValueError( + f"The product of batch size ({batch_size}) and hidden size ({hidden_size}) needs to be a round " + f"multiple of {min(hidden_size, 32)}." + ) + + ctx.input_dtype = key.dtype + + if ( + time_decay.device.type != "cuda" + or time_first.device.type != "cuda" + or key.device.type != "cuda" + or value.device.type != "cuda" + ): + raise ValueError("Calling the CUDA kernel for wkv attention requires all tensors to be on CUDA devices.") + + time_decay = -torch.exp(time_decay.float().contiguous()) + if key.dtype == torch.float16: + time_first = time_first.float() + key = key.float() + value = value.float() + time_first = time_first.contiguous() + key = key.contiguous() + value = value.contiguous() + # The CUDA kernel will fill this tensor. 
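+        # `output` has one value per token and channel; `state` (allocated below when needed) packs, for every
+        # channel, the rescaled numerator, the rescaled denominator and the running maximum of the exponents
+        # described in the documentation of this model (the maximum slot is initialized to -1e38, i.e. roughly -inf).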
+ output = torch.empty_like(key, memory_format=torch.contiguous_format) + if return_state or state is not None: + if state is None: + state = torch.zeros( + batch_size, + hidden_size, + 3, + dtype=torch.float32, + device=key.device, + memory_format=torch.contiguous_format, + ) + state[:, :, 2] -= 1e38 + else: + state = torch.cat([s.unsqueeze(2) for s in state], dim=2).contiguous() + if key.dtype == torch.bfloat16: + forward_func = rwkv_cuda_kernel.forward_with_state_bf16 + else: + forward_func = rwkv_cuda_kernel.forward_with_state + forward_func(time_decay, time_first, key, value, output, state) + else: + forward_func = rwkv_cuda_kernel.forward_bf16 if key.dtype == torch.bfloat16 else rwkv_cuda_kernel.forward + forward_func(time_decay, time_first, key, value, output) + + ctx.save_for_backward(time_decay, time_first, key, value, output) + + if state is not None: + state = [s.squeeze(2) for s in torch.chunk(state, 3, dim=2)] + + return output.to(ctx.input_dtype), state + + @staticmethod + # g stands for grad + def backward(ctx, g_output): + input_dtype = ctx.input_dtype + + time_decay, time_first, key, value, output = ctx.saved_tensors + # The CUDA kernel will fill those tensors. + g_time_decay = torch.empty_like( + time_decay, + memory_format=torch.contiguous_format, + dtype=torch.bfloat16 if input_dtype == torch.bfloat16 else torch.float32, + ) + g_time_first = torch.empty_like(time_first, memory_format=torch.contiguous_format) + g_key = torch.empty_like(key, memory_format=torch.contiguous_format) + g_value = torch.empty_like(value, memory_format=torch.contiguous_format) + + if input_dtype == torch.float16: + g_output = g_output.float() + backward_func = rwkv_cuda_kernel.backward_bf16 if input_dtype == torch.bfloat16 else rwkv_cuda_kernel.backward + backward_func( + time_decay, + time_first, + key, + value, + output, + g_output.contiguous(), + g_time_decay, + g_time_first, + g_key, + g_value, + ) + g_time_decay = torch.sum(g_time_decay, dim=0) + g_time_first = torch.sum(g_time_first, dim=0) + + return ( + None, + None, + None, + g_time_decay.to(input_dtype), + g_time_first.to(input_dtype), + g_key.to(input_dtype), + g_value.to(input_dtype), + ) + + +def rwkv_linear_attention_cpu(time_decay, time_first, key, value, state=None, return_state=False): + # For CPU fallback. Will be slower and probably take more memory than the custom CUDA kernel if not executed + # within a torch.no_grad. 
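+    # The loop below implements the numerically stable recurrence described in the RWKV documentation:
+    # `num_state`, `den_state` and `max_state` are the rescaled numerator, the rescaled denominator and the running
+    # maximum of the exponents; `time_first` corresponds to u, while w is obtained as -exp(time_decay) a few lines
+    # below.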
+ _, seq_length, _ = key.size() + output = torch.zeros_like(key) + + if state is None: + num_state = torch.zeros_like(key[:, 0], dtype=torch.float32) + den_state = torch.zeros_like(key[:, 0], dtype=torch.float32) + max_state = torch.zeros_like(key[:, 0], dtype=torch.float32) - 1e38 + else: + num_state, den_state, max_state = state + # For numerical stability + # real_numerator_state = num_state * torch.exp(max_state) + # real_denominator_state = den_state * torch.exp(max_state) + + time_decay = -torch.exp(time_decay) + + for current_index in range(seq_length): + current_key = key[:, current_index].float() + current_value = value[:, current_index] + + # wkv computation at time t + max_for_output = torch.maximum(max_state, current_key + time_first) + e1 = torch.exp(max_state - max_for_output) + e2 = torch.exp(current_key + time_first - max_for_output) + numerator = e1 * num_state + e2 * current_value + denominator = e1 * den_state + e2 + output[:, current_index] = (numerator / denominator).to(output.dtype) + + # Update state for next iteration + max_for_state = torch.maximum(max_state + time_decay, current_key) + e1 = torch.exp(max_state + time_decay - max_for_state) + e2 = torch.exp(current_key - max_for_state) + num_state = e1 * num_state + e2 * current_value + den_state = e1 * den_state + e2 + max_state = max_for_state + + if return_state or state is not None: + state = [num_state, den_state, max_state] + + return output, state + + +def rwkv_linear_attention(time_decay, time_first, key, value, state=None, return_state=False): + no_cuda = any(t.device.type != "cuda" for t in [time_decay, time_first, key, value]) + # Launching the CUDA kernel for just one token will actually be slower (there is no for loop in the CPU version + # in this case). + one_token = key.size(1) == 1 + if rwkv_cuda_kernel is None or no_cuda or one_token: + return rwkv_linear_attention_cpu(time_decay, time_first, key, value, state=state, return_state=return_state) + else: + return RwkvLinearAttention.apply(time_decay, time_first, key, value, state, return_state) + + +class RwkvSelfAttention(nn.Module): + def __init__(self, config, layer_id=0): + super().__init__() + self.config = config + kernel_loaded = rwkv_cuda_kernel is not None and rwkv_cuda_kernel.max_seq_length == config.context_length + if is_ninja_available() and is_torch_cuda_available() and not kernel_loaded: + try: + load_wkv_cuda_kernel(config.context_length) + except Exception: + logger.info("Could not load the custom CUDA kernel for RWKV attention.") + self.layer_id = layer_id + hidden_size = config.hidden_size + attention_hidden_size = ( + config.attention_hidden_size if config.attention_hidden_size is not None else hidden_size + ) + self.attention_hidden_size = attention_hidden_size + + self.time_decay = nn.Parameter(torch.empty(attention_hidden_size)) + self.time_first = nn.Parameter(torch.empty(attention_hidden_size)) + + self.time_mix_key = nn.Parameter(torch.empty(1, 1, hidden_size)) + self.time_mix_value = nn.Parameter(torch.empty(1, 1, hidden_size)) + self.time_mix_receptance = nn.Parameter(torch.empty(1, 1, hidden_size)) + + self.time_shift = nn.ZeroPad2d((0, 0, 1, -1)) + self.key = nn.Linear(hidden_size, attention_hidden_size, bias=False) + self.value = nn.Linear(hidden_size, attention_hidden_size, bias=False) + self.receptance = nn.Linear(hidden_size, attention_hidden_size, bias=False) + self.output = nn.Linear(attention_hidden_size, hidden_size, bias=False) + + # TODO: maybe jit, otherwise move inside forward + def extract_key_value(self, 
hidden, state=None): + # Mix hidden with the previous timestep to produce key, value, receptance + if hidden.size(1) == 1 and state is not None: + shifted = state[1][:, :, self.layer_id] + else: + shifted = self.time_shift(hidden) + if state is not None: + shifted[:, 0] = state[1][:, :, self.layer_id] + key = hidden * self.time_mix_key + shifted * (1 - self.time_mix_key) + value = hidden * self.time_mix_value + shifted * (1 - self.time_mix_value) + receptance = hidden * self.time_mix_receptance + shifted * (1 - self.time_mix_receptance) + + key = self.key(key) + value = self.value(value) + receptance = torch.sigmoid(self.receptance(receptance)) + if state is not None: + state[1][:, :, self.layer_id] = hidden[:, -1] + return receptance, key, value, state + + def forward(self, hidden, state=None, use_cache=False): + receptance, key, value, state = self.extract_key_value(hidden, state=state) + layer_state = tuple(s[:, :, self.layer_id] for s in state[2:]) if state is not None else None + rwkv, layer_state = rwkv_linear_attention( + self.time_decay, + self.time_first, + key, + value, + state=layer_state, + return_state=use_cache, + ) + + if layer_state is not None: + state[2][:, :, self.layer_id] = layer_state[0] + state[3][:, :, self.layer_id] = layer_state[1] + state[4][:, :, self.layer_id] = layer_state[2] + + return self.output(receptance * rwkv), state + + +class RwkvFeedForward(nn.Module): + def __init__(self, config, layer_id=0): + super().__init__() + self.config = config + self.layer_id = layer_id + hidden_size = config.hidden_size + intermediate_size = ( + config.intermediate_size if config.intermediate_size is not None else 4 * config.hidden_size + ) + + self.time_shift = nn.ZeroPad2d((0, 0, 1, -1)) + self.time_mix_key = nn.Parameter(torch.empty(1, 1, hidden_size)) + self.time_mix_receptance = nn.Parameter(torch.empty(1, 1, hidden_size)) + + self.key = nn.Linear(hidden_size, intermediate_size, bias=False) + self.receptance = nn.Linear(hidden_size, hidden_size, bias=False) + self.value = nn.Linear(intermediate_size, hidden_size, bias=False) + + def forward(self, hidden, state=None): + if hidden.size(1) == 1 and state is not None: + shifted = state[0][:, :, self.layer_id] + else: + shifted = self.time_shift(hidden) + if state is not None: + shifted[:, 0] = state[0][:, :, self.layer_id] + key = hidden * self.time_mix_key + shifted * (1 - self.time_mix_key) + receptance = hidden * self.time_mix_receptance + shifted * (1 - self.time_mix_receptance) + + key = torch.square(torch.relu(self.key(key))) + value = self.value(key) + receptance = torch.sigmoid(self.receptance(receptance)) + + if state is not None: + state[0][:, :, self.layer_id] = hidden[:, -1] + + return receptance * value, state + + +class RwkvBlock(nn.Module): + def __init__(self, config, layer_id): + super().__init__() + self.config = config + self.layer_id = layer_id + + if layer_id == 0: + self.pre_ln = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_epsilon) + + self.ln1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_epsilon) + self.ln2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_epsilon) + + self.attention = RwkvSelfAttention(config, layer_id) + self.feed_forward = RwkvFeedForward(config, layer_id) + + def forward(self, hidden, state=None, use_cache=False, output_attentions=False): + if self.layer_id == 0: + hidden = self.pre_ln(hidden) + + attention, state = self.attention(self.ln1(hidden), state=state, use_cache=use_cache) + hidden = hidden + attention + + feed_forward, state = 
self.feed_forward(self.ln2(hidden), state=state) + hidden = hidden + feed_forward + + outputs = (hidden, state) + if output_attentions: + outputs += (attention,) + else: + outputs += (None,) + + return outputs + + +class RwkvPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = RwkvConfig + base_model_prefix = "rwkv" + _no_split_modules = ["RwkvBlock"] + _keep_in_fp32_modules = ["time_decay", "time_first"] + + def _init_weights(self, module): + """Initialize the weights.""" + if isinstance(module, RwkvSelfAttention): + layer_id = module.layer_id + num_hidden_layers = module.config.num_hidden_layers + hidden_size = module.config.hidden_size + attention_hidden_size = module.attention_hidden_size + + ratio_0_to_1 = layer_id / (num_hidden_layers - 1) # 0 to 1 + ratio_1_to_almost0 = 1.0 - (layer_id / num_hidden_layers) # 1 to ~0 + + time_weight = torch.tensor( + [i / hidden_size for i in range(hidden_size)], + dtype=module.time_mix_key.dtype, + device=module.time_mix_key.device, + ) + time_weight = time_weight[None, None, :] + + decay_speed = [ + -5 + 8 * (h / (attention_hidden_size - 1)) ** (0.7 + 1.3 * ratio_0_to_1) + for h in range(attention_hidden_size) + ] + decay_speed = torch.tensor(decay_speed, dtype=module.time_decay.dtype, device=module.time_decay.device) + zigzag = ( + torch.tensor( + [(i + 1) % 3 - 1 for i in range(attention_hidden_size)], + dtype=module.time_first.dtype, + device=module.time_first.device, + ) + * 0.5 + ) + + with torch.no_grad(): + module.time_decay.data = decay_speed + module.time_first.data = torch.ones_like(module.time_first * math.log(0.3) + zigzag) + + module.time_mix_key.data = torch.pow(time_weight, ratio_1_to_almost0) + module.time_mix_value.data = torch.pow(time_weight, ratio_1_to_almost0) + 0.3 * ratio_0_to_1 + module.time_mix_receptance.data = torch.pow(time_weight, 0.5 * ratio_1_to_almost0) + elif isinstance(module, RwkvFeedForward): + layer_id = module.layer_id + num_hidden_layers = module.config.num_hidden_layers + hidden_size = module.config.hidden_size + + ratio_1_to_almost0 = 1.0 - (layer_id / num_hidden_layers) # 1 to ~0 + + time_weight = torch.tensor( + [i / hidden_size for i in range(hidden_size)], + dtype=module.time_mix_key.dtype, + device=module.time_mix_key.device, + ) + time_weight = time_weight[None, None, :] + + with torch.no_grad(): + module.time_mix_key.data = torch.pow(time_weight, ratio_1_to_almost0) + module.time_mix_receptance.data = torch.pow(time_weight, ratio_1_to_almost0) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, RwkvModel): + module.gradient_checkpointing = value + + +@dataclass +class RwkvOutput(ModelOutput): + """ + Class for the RWKV model outputs. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + state (list of five `torch.FloatTensor` of shape `(batch_size, hidden_size, num_hidden_layers)`): + The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to + avoid providing the old `input_ids`. 
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + last_hidden_state: torch.FloatTensor = None + state: Optional[List[torch.FloatTensor]] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +class RwkvCausalLMOutput(ModelOutput): + """ + Base class for causal language model (or autoregressive) outputs. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + state (list of five `torch.FloatTensor` of shape `(batch_size, hidden_size, num_hidden_layers)`): + The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to + avoid providing the old `input_ids`. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + logits: torch.FloatTensor = None + state: Optional[List[torch.FloatTensor]] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +RWKV_START_DOCSTRING = r""" + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`RwkvConfig`]): Model configuration class with all the parameters of the model. 
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+RWKV_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
+            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
+            `past_key_values[0][0].shape[-2]` (`sequence_length` of input past key value states). Indices of input
+            sequence tokens in the vocabulary.
+
+            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
+            `input_ids`.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        state (tuple of five `torch.FloatTensor` of shape `(batch_size, hidden_size, num_hidden_layers)`, *optional*):
+            If passed along, the model uses the previous state in all the blocks (which will give the output for the
+            `input_ids` provided as if the model added `state_input_ids + input_ids` as context).
+        use_cache (`bool`, *optional*):
+            If set to `True`, the last state is returned and can be used to quickly generate the next logits.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+""" + + +@add_start_docstrings( + "The bare RWKV Model transformer outputting raw hidden-states without any specific head on top.", + RWKV_START_DOCSTRING, +) +class RwkvModel(RwkvPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.embeddings = nn.Embedding(config.vocab_size, config.hidden_size) + self.blocks = nn.ModuleList([RwkvBlock(config, layer_id=idx) for idx in range(config.num_hidden_layers)]) + self.ln_out = nn.LayerNorm(config.hidden_size) + + self.layers_are_rescaled = False + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings + + def set_input_embeddings(self, new_embeddings): + self.embeddings = new_embeddings + + @add_start_docstrings_to_model_forward(RWKV_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=RwkvOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + state: Optional[List[torch.FloatTensor]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, RwkvOutput]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else (self.config.use_cache if not self.training else False) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.training == self.layers_are_rescaled: + self._rescale_layers() + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is None and inputs_embeds is None: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.embeddings(input_ids) + + if use_cache and state is None: + shape = (inputs_embeds.size(0), self.config.hidden_size, self.config.num_hidden_layers) + state = [ + torch.zeros( + *shape, dtype=inputs_embeds.dtype if i <= 1 else torch.float32, device=inputs_embeds.device + ) + for i in range(5) + ] + state[4] -= 1e30 + + hidden_states = inputs_embeds + + all_self_attentions = () if output_attentions else None + all_hidden_states = () if output_hidden_states else None + for idx, block in enumerate(self.blocks): + hidden_states, state, attentions = block( + hidden_states, state=state, use_cache=use_cache, output_attentions=output_attentions + ) + if ( + self.layers_are_rescaled + and self.config.rescale_every > 0 + and (idx + 1) % self.config.rescale_every == 0 + ): + hidden_states = hidden_states / 2 + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if output_attentions: + all_self_attentions = all_self_attentions + (attentions,) + + hidden_states = self.ln_out(hidden_states) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return (hidden_states, state, all_hidden_states, all_self_attentions) + + return RwkvOutput( + last_hidden_state=hidden_states, + state=state, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + def _rescale_layers(self): + # Layers 
should be rescaled for inference only. + if self.layers_are_rescaled == (not self.training): + return + if self.config.rescale_every > 0: + with torch.no_grad(): + for block_id, block in enumerate(self.blocks): + if self.training: + block.attention.output.weight.mul_(2 ** int(block_id // self.config.rescale_every)) + block.feed_forward.value.weight.mul_(2 ** int(block_id // self.config.rescale_every)) + else: + block.attention.output.weight.div_(2 ** int(block_id // self.config.rescale_every)) + block.feed_forward.value.weight.div_(2 ** int(block_id // self.config.rescale_every)) + + self.layers_are_rescaled = not self.training + + +@add_start_docstrings( + """ + The RWKV Model transformer with a language modeling head on top (linear layer with weights tied to the input + embeddings). + """, + RWKV_START_DOCSTRING, +) +class RwkvForCausalLM(RwkvPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.rwkv = RwkvModel(config) + self.head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.head + + def set_output_embeddings(self, new_embeddings): + self.head = new_embeddings + + def prepare_inputs_for_generation(self, input_ids, state=None, inputs_embeds=None, **kwargs): + # only last token for inputs_ids if the state is passed along. + if state is not None: + input_ids = input_ids[:, -1].unsqueeze(-1) + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and state is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs["state"] = state + return model_inputs + + @add_start_docstrings_to_model_forward(RWKV_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=RwkvCausalLMOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + state: Optional[List[torch.FloatTensor]] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, RwkvCausalLMOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. 
you can set + `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` + are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + rwkv_outputs = self.rwkv( + input_ids, + inputs_embeds=inputs_embeds, + state=state, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = rwkv_outputs[0] + + logits = self.head(hidden_states) + + loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(logits.device) + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) + + if not return_dict: + output = (logits,) + rwkv_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return RwkvCausalLMOutput( + loss=loss, + logits=logits, + state=rwkv_outputs.state, + hidden_states=rwkv_outputs.hidden_states, + attentions=rwkv_outputs.attentions, + ) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 7279117698db..eeb799e18875 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -6105,6 +6105,30 @@ def load_tf_weights_in_roformer(*args, **kwargs): requires_backends(load_tf_weights_in_roformer, ["torch"]) +RWKV_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class RwkvForCausalLM(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class RwkvModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class RwkvPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + SAM_PRETRAINED_MODEL_ARCHIVE_LIST = None diff --git a/tests/models/rwkv/__init__.py b/tests/models/rwkv/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/models/rwkv/test_modeling_rwkv.py b/tests/models/rwkv/test_modeling_rwkv.py new file mode 100644 index 000000000000..4afcc9b41f86 --- /dev/null +++ b/tests/models/rwkv/test_modeling_rwkv.py @@ -0,0 +1,451 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
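The `state` and `use_cache` arguments documented in `RWKV_INPUTS_DOCSTRING` above, and exercised by `create_and_check_state_equivalency` in the test module that follows, boil down to threading the recurrent state of one forward pass into the next. The sketch below is illustrative rather than part of the patch: it assumes an editable install of this branch (so the `Rwkv*` classes are importable), and the tiny config values are arbitrary.

```python
# Minimal sketch of RWKV state reuse; the config sizes are arbitrary and the model is
# randomly initialized, so only the chunked-vs-full equivalence matters here.
import torch

from transformers import RwkvConfig, RwkvModel

config = RwkvConfig(vocab_size=99, hidden_size=32, num_hidden_layers=2)
model = RwkvModel(config).eval()

input_ids = torch.randint(0, config.vocab_size, (1, 8))

with torch.no_grad():
    # One pass over the full sequence.
    full = model(input_ids).last_hidden_state

    # Same sequence in two chunks, threading the recurrent state through.
    first = model(input_ids[:, :4])
    second = model(input_ids[:, 4:], state=first.state)

chunked = torch.cat([first.last_hidden_state, second.last_hidden_state], dim=1)
print(torch.allclose(full, chunked, atol=1e-5))  # expected: True, mirroring the equivalency test below
```

This is the same contract `RwkvForCausalLM.prepare_inputs_for_generation` relies on when it keeps only the last token once a `state` is available.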
+ + +import unittest +from unittest.util import safe_repr + +from transformers import AutoTokenizer, RwkvConfig, is_torch_available +from transformers.testing_utils import require_torch, slow, torch_device + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import ( + RWKV_PRETRAINED_MODEL_ARCHIVE_LIST, + RwkvForCausalLM, + RwkvModel, + ) + + +class RwkvModelTester: + def __init__( + self, + parent, + batch_size=14, + seq_length=7, + is_training=True, + use_token_type_ids=False, + use_input_mask=True, + use_labels=True, + use_mc_token_ids=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_token_type_ids = use_token_type_ids + self.use_input_mask = use_input_mask + self.use_labels = use_labels + self.use_mc_token_ids = use_mc_token_ids + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + self.bos_token_id = vocab_size - 1 + self.eos_token_id = vocab_size - 1 + self.pad_token_id = vocab_size - 1 + + def get_large_model_config(self): + return RwkvConfig.from_pretrained("sgugger/rwkv-4-pile-7b") + + def prepare_config_and_inputs( + self, gradient_checkpointing=False, scale_attn_by_inverse_layer_idx=False, reorder_and_upcast_attn=False + ): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + mc_token_ids = None + if self.use_mc_token_ids: + mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = self.get_config( + gradient_checkpointing=gradient_checkpointing, + scale_attn_by_inverse_layer_idx=scale_attn_by_inverse_layer_idx, + reorder_and_upcast_attn=reorder_and_upcast_attn, + ) + + return ( + config, + input_ids, + input_mask, + None, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) + + def get_config( + self, gradient_checkpointing=False, 
scale_attn_by_inverse_layer_idx=False, reorder_and_upcast_attn=False + ): + return RwkvConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + intermediate_size=self.intermediate_size, + activation_function=self.hidden_act, + resid_pdrop=self.hidden_dropout_prob, + attn_pdrop=self.attention_probs_dropout_prob, + n_positions=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + use_cache=True, + bos_token_id=self.bos_token_id, + eos_token_id=self.eos_token_id, + pad_token_id=self.pad_token_id, + gradient_checkpointing=gradient_checkpointing, + scale_attn_by_inverse_layer_idx=scale_attn_by_inverse_layer_idx, + reorder_and_upcast_attn=reorder_and_upcast_attn, + ) + + def get_pipeline_config(self): + config = self.get_config() + config.vocab_size = 300 + return config + + def prepare_config_and_inputs_for_decoder(self): + ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) = self.prepare_config_and_inputs() + + encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) + encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + return ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) + + def create_and_check_rwkv_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): + config.output_hidden_states = True + model = RwkvModel(config=config) + model.to(torch_device) + model.eval() + + result = model(input_ids) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(len(result.hidden_states), config.num_hidden_layers + 1) + + def create_and_check_causl_lm(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): + model = RwkvForCausalLM(config) + model.to(torch_device) + model.eval() + + result = model(input_ids, labels=input_ids) + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_state_equivalency(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): + model = RwkvModel(config=config) + model.to(torch_device) + model.eval() + + outputs = model(input_ids) + output_whole = outputs.last_hidden_state + + outputs = model(input_ids[:, :2]) + output_one = outputs.last_hidden_state + + # Using the state computed on the first inputs, we will get the same output + outputs = model(input_ids[:, 2:], state=outputs.state) + output_two = outputs.last_hidden_state + + self.parent.assertTrue(torch.allclose(torch.cat([output_one, output_two], dim=1), output_whole, atol=1e-5)) + + def create_and_check_forward_and_backwards( + self, config, input_ids, input_mask, head_mask, token_type_ids, *args, gradient_checkpointing=False + ): + model = RwkvForCausalLM(config) + model.to(torch_device) + if gradient_checkpointing: + model.gradient_checkpointing_enable() + + result = model(input_ids, labels=input_ids) + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + result.loss.backward() + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + + ( + config, + input_ids, 
+ input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + + inputs_dict = {"input_ids": input_ids} + + return config, inputs_dict + + +@require_torch +class RwkvModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (RwkvModel, RwkvForCausalLM) if is_torch_available() else () + pipeline_model_mapping = ( + { + "feature-extraction": RwkvModel, + "text-generation": RwkvForCausalLM, + } + if is_torch_available() + else {} + ) + # all_generative_model_classes = (RwkvForCausalLM,) if is_torch_available() else () + fx_compatible = False + test_missing_keys = False + test_model_parallel = False + test_pruning = False + test_head_masking = False # Rwkv does not support head masking + + def setUp(self): + self.model_tester = RwkvModelTester(self) + self.config_tester = ConfigTester( + self, config_class=RwkvConfig, n_embd=37, common_properties=["hidden_size", "num_hidden_layers"] + ) + + def assertInterval(self, member, container, msg=None): + r""" + Simple utility function to check if a member is inside an interval. + """ + if isinstance(member, torch.Tensor): + max_value, min_value = member.max().item(), member.min().item() + elif isinstance(member, list) or isinstance(member, tuple): + max_value, min_value = max(member), min(member) + + if not isinstance(container, list): + raise TypeError("container should be a list or tuple") + elif len(container) != 2: + raise ValueError("container should have 2 elements") + + expected_min, expected_max = container + + is_inside_interval = (min_value >= expected_min) and (max_value <= expected_max) + + if not is_inside_interval: + standardMsg = "%s not found in %s" % (safe_repr(member), safe_repr(container)) + self.fail(self._formatMessage(msg, standardMsg)) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_rwkv_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_rwkv_model(*config_and_inputs) + + def test_rwkv_lm_head_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_causl_lm(*config_and_inputs) + + def test_state_equivalency(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_state_equivalency(*config_and_inputs) + + def test_initialization(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config=config) + for name, param in model.named_parameters(): + if "time_decay" in name: + if param.requires_grad: + self.assertTrue(param.data.max().item() == 3.0) + self.assertTrue(param.data.min().item() == -5.0) + elif "time_first" in name: + if param.requires_grad: + # check if it's a ones like + self.assertTrue(torch.allclose(param.data, torch.ones_like(param.data), atol=1e-5, rtol=1e-5)) + elif any([x in name for x in ["time_mix_key", "time_mix_receptance"]]): + if param.requires_grad: + self.assertInterval( + param.data, + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + elif "time_mix_value" in name: + if param.requires_grad: + self.assertInterval( + param.data, + [0.0, 1.3], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + def test_attention_outputs(self): + r""" + Overriding the test_attention_outputs test as the 
attention outputs of Rwkv are different from other models + it has a shape `batch_size, seq_len, hidden_size`. + """ + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + seq_len = getattr(self.model_tester, "seq_length", None) + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + model.to(torch_device) + model.eval() + + inputs = self._prepare_for_class(inputs_dict, model_class) + batch_size = inputs["input_ids"].shape[0] + with torch.no_grad(): + outputs = model(**inputs) + attentions = outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + model.to(torch_device) + model.eval() + + inputs = self._prepare_for_class(inputs_dict, model_class) + batch_size = inputs["input_ids"].shape[0] + with torch.no_grad(): + outputs = model(**inputs) + attentions = outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + self.assertListEqual( + list(attentions[0].shape[-3:]), + [batch_size, seq_len, config.hidden_size], + ) + out_len = len(outputs) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + model.to(torch_device) + model.eval() + + inputs = self._prepare_for_class(inputs_dict, model_class) + batch_size = inputs["input_ids"].shape[0] + with torch.no_grad(): + outputs = model(**inputs) + + added_hidden_states = 1 + self.assertEqual(out_len + added_hidden_states, len(outputs)) + + self_attentions = outputs.attentions + + self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [batch_size, seq_len, config.hidden_size], + ) + + @slow + def test_model_from_pretrained(self): + for model_name in RWKV_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = RwkvModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +@slow +class RWKVIntegrationTests(unittest.TestCase): + def setUp(self): + self.model_id = "RWKV/rwkv-4-169m-pile" + self.tokenizer = AutoTokenizer.from_pretrained(self.model_id) + + def test_simple_generate(self): + expected_output = "Hello my name is Jasmine and I am a newbie to the" + model = RwkvForCausalLM.from_pretrained(self.model_id).to(torch_device) + + input_ids = self.tokenizer("Hello my name is", return_tensors="pt").input_ids.to(torch_device) + output = model.generate(input_ids, max_new_tokens=10) + output_sentence = self.tokenizer.decode(output[0].tolist()) + + self.assertEqual(output_sentence, expected_output) + + def test_simple_generate_bf16(self): + expected_output = "Hello my name is Jasmine and I am a newbie to the" + + input_ids = self.tokenizer("Hello my name is", return_tensors="pt").input_ids.to(torch_device) + model = RwkvForCausalLM.from_pretrained(self.model_id, torch_dtype=torch.bfloat16).to(torch_device) + + output = model.generate(input_ids, max_new_tokens=10) + output_sentence = self.tokenizer.decode(output[0].tolist()) + + self.assertEqual(output_sentence, expected_output) diff --git a/tests/test_configuration_common.py b/tests/test_configuration_common.py index 1d249c3b52dd..fdb679529d2b 100644 --- 
a/tests/test_configuration_common.py +++ b/tests/test_configuration_common.py @@ -93,15 +93,20 @@ class ConfigTester(object): - def __init__(self, parent, config_class=None, has_text_modality=True, **kwargs): + def __init__(self, parent, config_class=None, has_text_modality=True, common_properties=None, **kwargs): self.parent = parent self.config_class = config_class self.has_text_modality = has_text_modality self.inputs_dict = kwargs + self.common_properties = common_properties def create_and_test_config_common_properties(self): config = self.config_class(**self.inputs_dict) - common_properties = ["hidden_size", "num_attention_heads", "num_hidden_layers"] + common_properties = ( + ["hidden_size", "num_attention_heads", "num_hidden_layers"] + if self.common_properties is None + else self.common_properties + ) # Add common fields for text models if self.has_text_modality: From c34a525d2faea2976fbeeabbaaae929d05f8d8a7 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Tue, 9 May 2023 19:04:27 +0200 Subject: [PATCH 056/935] Proposed fix for TF example now running on safetensors. (#23208) * Proposed fix for TF example now running on safetensors. * Adding more warnings and returning keys. * Trigger CI * Trigger CI --------- Co-authored-by: Sylvain Gugger --- .../tensorflow/test_tensorflow_examples.py | 1 - src/transformers/modeling_tf_pytorch_utils.py | 35 +++++++++++++++++-- src/transformers/modeling_tf_utils.py | 1 + 3 files changed, 34 insertions(+), 3 deletions(-) diff --git a/examples/tensorflow/test_tensorflow_examples.py b/examples/tensorflow/test_tensorflow_examples.py index d5ae4c71b869..956209baade4 100644 --- a/examples/tensorflow/test_tensorflow_examples.py +++ b/examples/tensorflow/test_tensorflow_examples.py @@ -297,7 +297,6 @@ def test_run_translation(self): result = get_results(tmp_dir) self.assertGreaterEqual(result["bleu"], 30) - @skip("Fix me Matt") def test_run_image_classification(self): tmp_dir = self.get_auto_remove_tmp_dir() testargs = f""" diff --git a/src/transformers/modeling_tf_pytorch_utils.py b/src/transformers/modeling_tf_pytorch_utils.py index 402159cc6f9e..3b1c030699b9 100644 --- a/src/transformers/modeling_tf_pytorch_utils.py +++ b/src/transformers/modeling_tf_pytorch_utils.py @@ -246,6 +246,7 @@ def load_pytorch_state_dict_in_tf2_model( output_loading_info=False, _prefix=None, tf_to_pt_weight_rename=None, + ignore_mismatched_sizes=False, ): """Load a pytorch state_dict in a TF 2.0 model.""" import tensorflow as tf @@ -297,6 +298,7 @@ def load_pytorch_state_dict_in_tf2_model( weight_value_tuples = [] all_pytorch_weights = set(pt_state_dict.keys()) missing_keys = [] + mismatched_keys = [] for symbolic_weight in symbolic_weights: sw_name = symbolic_weight.name name, transpose = convert_tf_weight_name_to_pt_weight_name( @@ -319,7 +321,18 @@ def load_pytorch_state_dict_in_tf2_model( continue raise AttributeError(f"{name} not found in PyTorch model") - array = apply_transpose(transpose, pt_state_dict[name], symbolic_weight.shape) + try: + array = apply_transpose(transpose, pt_state_dict[name], symbolic_weight.shape) + except tf.errors.InvalidArgumentError as e: + if not ignore_mismatched_sizes: + error_msg = str(e) + error_msg += ( + "\n\tYou may consider adding `ignore_mismatched_sizes=True` in the model `from_pretrained` method." 
+ ) + raise tf.errors.InvalidArgumentError(error_msg) + else: + mismatched_keys.append((name, pt_state_dict[name].shape, symbolic_weight.shape)) + continue tf_loaded_numel += tensor_size(array) @@ -367,8 +380,26 @@ def load_pytorch_state_dict_in_tf2_model( f"you can already use {tf_model.__class__.__name__} for predictions without further training." ) + if len(mismatched_keys) > 0: + mismatched_warning = "\n".join( + [ + f"- {key}: found shape {shape1} in the checkpoint and {shape2} in the model instantiated" + for key, shape1, shape2 in mismatched_keys + ] + ) + logger.warning( + f"Some weights of {tf_model.__class__.__name__} were not initialized from the model checkpoint" + f" are newly initialized because the shapes did not" + f" match:\n{mismatched_warning}\nYou should probably TRAIN this model on a down-stream task to be able" + " to use it for predictions and inference." + ) + if output_loading_info: - loading_info = {"missing_keys": missing_keys, "unexpected_keys": unexpected_keys} + loading_info = { + "missing_keys": missing_keys, + "unexpected_keys": unexpected_keys, + "mismatched_keys": mismatched_keys, + } return tf_model, loading_info return tf_model diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py index f48651a6e9cc..35c526379c88 100644 --- a/src/transformers/modeling_tf_utils.py +++ b/src/transformers/modeling_tf_utils.py @@ -2820,6 +2820,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): allow_missing_keys=True, output_loading_info=output_loading_info, _prefix=load_weight_prefix, + ignore_mismatched_sizes=ignore_mismatched_sizes, ) # 'by_name' allow us to do transfer learning by skipping/adding layers From 650a71e157478cc8c9d9dc648a6a79108f49e047 Mon Sep 17 00:00:00 2001 From: Konstantin Dobler Date: Tue, 9 May 2023 19:05:13 +0200 Subject: [PATCH 057/935] Support ratios for `logging_steps`, `eval_steps`, and `save_steps` (#23235) * Ratio option for `logging_steps`, `eval_steps`, `save_steps` * Add guards if arguments are not set * Add more detailed comments + formatting * Update src/transformers/training_args.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/training_args.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/training_args.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Convert args values to `int` if bigger than 1 * `black` * `make fixup` --------- Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- src/transformers/trainer.py | 8 ++++ src/transformers/training_args.py | 72 +++++++++++++++++++++++++++---- 2 files changed, 71 insertions(+), 9 deletions(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index d18e3efeb87a..f7fb3558df7f 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -1712,6 +1712,14 @@ def _inner_training_loop( f" {args.max_steps}" ) + # Compute absolute values for logging, eval, and save if given as ratio + if args.logging_steps and args.logging_steps < 1: + args.logging_steps = math.ceil(max_steps * args.logging_steps) + if args.eval_steps and args.eval_steps < 1: + args.eval_steps = math.ceil(max_steps * args.eval_steps) + if args.save_steps and args.save_steps < 1: + args.save_steps = math.ceil(max_steps * args.save_steps) + if DebugOption.UNDERFLOW_OVERFLOW in self.args.debug: if self.args.n_gpu > 1: # nn.DataParallel(model) replicates the model, 
creating new variables and module diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index 4b0ea975b26d..44f28ff99ef4 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -251,8 +251,9 @@ class TrainingArguments: logging_first_step (`bool`, *optional*, defaults to `False`): Whether to log and evaluate the first `global_step` or not. - logging_steps (`int`, *optional*, defaults to 500): - Number of update steps between two logs if `logging_strategy="steps"`. + logging_steps (`int` or `float`, *optional*, defaults to 500): + Number of update steps between two logs if `logging_strategy="steps"`. Should be an integer or a float in + range `[0,1)`. If smaller than 1, will be interpreted as ratio of total training steps. logging_nan_inf_filter (`bool`, *optional*, defaults to `True`): Whether to filter `nan` and `inf` losses for logging. If set to `True` the loss of every step that is `nan` or `inf` is filtered and the average loss of the current logging window is taken instead. @@ -270,8 +271,9 @@ class TrainingArguments: - `"no"`: No save is done during training. - `"epoch"`: Save is done at the end of each epoch. - `"steps"`: Save is done every `save_steps`. - save_steps (`int`, *optional*, defaults to 500): - Number of updates steps before two checkpoint saves if `save_strategy="steps"`. + save_steps (`int` or `float`, *optional*, defaults to 500): + Number of updates steps before two checkpoint saves if `save_strategy="steps"`. Should be an integer or a + float in range `[0,1)`. If smaller than 1, will be interpreted as ratio of total training steps. save_total_limit (`int`, *optional*): If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in `output_dir`. @@ -332,9 +334,10 @@ class TrainingArguments: dataloader_drop_last (`bool`, *optional*, defaults to `False`): Whether to drop the last incomplete batch (if the length of the dataset is not divisible by the batch size) or not. - eval_steps (`int`, *optional*): + eval_steps (`int` or `float`, *optional*): Number of update steps between two evaluations if `evaluation_strategy="steps"`. Will default to the same - value as `logging_steps` if not set. + value as `logging_steps` if not set. Should be an integer or a float in range `[0,1)`. If smaller than 1, + will be interpreted as ratio of total training steps. dataloader_num_workers (`int`, *optional*, defaults to 0): Number of subprocesses to use for data loading (PyTorch only). 0 means that the data will be loaded in the main process. @@ -721,13 +724,29 @@ class TrainingArguments: metadata={"help": "The logging strategy to use."}, ) logging_first_step: bool = field(default=False, metadata={"help": "Log the first global_step"}) - logging_steps: int = field(default=500, metadata={"help": "Log every X updates steps."}) + logging_steps: float = field( + default=500, + metadata={ + "help": ( + "Log every X updates steps. Should be an integer or a float in range `[0,1)`." + "If smaller than 1, will be interpreted as ratio of total training steps." 
+ ) + }, + ) logging_nan_inf_filter: bool = field(default=True, metadata={"help": "Filter nan and inf losses for logging."}) save_strategy: Union[IntervalStrategy, str] = field( default="steps", metadata={"help": "The checkpoint save strategy to use."}, ) - save_steps: int = field(default=500, metadata={"help": "Save checkpoint every X updates steps."}) + save_steps: float = field( + default=500, + metadata={ + "help": ( + "Save checkpoint every X updates steps. Should be an integer or a float in range `[0,1)`." + "If smaller than 1, will be interpreted as ratio of total training steps." + ) + }, + ) save_total_limit: Optional[int] = field( default=None, metadata={ @@ -854,7 +873,15 @@ class TrainingArguments: dataloader_drop_last: bool = field( default=False, metadata={"help": "Drop the last incomplete batch if it is not divisible by the batch size."} ) - eval_steps: Optional[int] = field(default=None, metadata={"help": "Run an evaluation every X steps."}) + eval_steps: Optional[float] = field( + default=None, + metadata={ + "help": ( + "Run an evaluation every X steps. Should be an integer or a float in range `[0,1)`." + "If smaller than 1, will be interpreted as ratio of total training steps." + ) + }, + ) dataloader_num_workers: int = field( default=0, metadata={ @@ -1186,6 +1213,19 @@ def __post_init__(self): if self.logging_strategy == IntervalStrategy.STEPS and self.logging_steps == 0: raise ValueError(f"logging strategy {self.logging_strategy} requires non-zero --logging_steps") + if self.logging_strategy == IntervalStrategy.STEPS and self.logging_steps > 1: + if self.logging_steps != int(self.logging_steps): + raise ValueError(f"--logging_steps must be an integer if bigger than 1: {self.logging_steps}") + self.logging_steps = int(self.logging_steps) + if self.evaluation_strategy == IntervalStrategy.STEPS and self.eval_steps > 1: + if self.eval_steps != int(self.eval_steps): + raise ValueError(f"--eval_steps must be an integer if bigger than 1: {self.eval_steps}") + self.eval_steps = int(self.eval_steps) + if self.save_strategy == IntervalStrategy.STEPS and self.save_steps > 1: + if self.save_steps != int(self.save_steps): + raise ValueError(f"--save_steps must be an integer if bigger than 1: {self.save_steps}") + self.save_steps = int(self.save_steps) + # Sanity checks for load_best_model_at_end: we require save and eval strategies to be compatible. if self.load_best_model_at_end: if self.evaluation_strategy != self.save_strategy: @@ -1194,6 +1234,20 @@ def __post_init__(self): f"strategy: {self.evaluation_strategy}\n- Save strategy: {self.save_strategy}" ) if self.evaluation_strategy == IntervalStrategy.STEPS and self.save_steps % self.eval_steps != 0: + if self.eval_steps < 1 or self.save_steps < 1: + if not (self.eval_steps < 1 and self.save_steps < 1): + raise ValueError( + "--load_best_model_at_end requires the saving steps to be a multiple of the evaluation " + "steps, which cannot get guaranteed when mixing ratio and absolute steps for save_steps" + f"{self.save_steps} and eval_steps {self.eval_steps}." + ) + # Work around floating point precision issues + LARGE_MULTIPLIER = 1_000_000 + if (self.save_steps * LARGE_MULTIPLIER) % (self.eval_steps * LARGE_MULTIPLIER) != 0: + raise ValueError( + "--load_best_model_at_end requires the saving steps to be a multiple of the evaluation " + f"steps, but found {self.save_steps}, which is not a multiple of {self.eval_steps}." 
+ ) raise ValueError( "--load_best_model_at_end requires the saving steps to be a round multiple of the evaluation " f"steps, but found {self.save_steps}, which is not a round multiple of {self.eval_steps}." From 627f44799a9f4948a6a1b8fe9e536eee0e29ea68 Mon Sep 17 00:00:00 2001 From: Arthur <48595927+ArthurZucker@users.noreply.github.com> Date: Wed, 10 May 2023 03:34:48 +0900 Subject: [PATCH 058/935] [Doctests] Refactor doctests + add CI (#22987) * intiial commit * new styling * update * just run doctest in CI * remove more test for fast dev * update * update refs * update path and fetch upstream * update documentatyion trests * typo * parse pwd * don't check for files that are in hidden folders * just give paths relative to transformers * update * update * update * major refactoring * make sure options is ok * lest test that mdx is tested * doctest glob * nits * update doctest nightly * some cleaning * run correct test on diff * debug * run on a single worker * skip_cuda_test tampkate * updates * add rA and continue on failure * test options * parse `py` codeblock? * we don't need to replace ignore results, don't remember whyu I put it * cleanup * more cleaning * fix arg * more cleaning * clean an todo * more pre-processing * doctest-module has none so extra `- ` is needed * remove logs * nits * doctest-modules .... * oups * let's use sugar * make dataset go quiet * add proper timeout * nites * spleling timeout * update * properly skip tests that have CUDSA * proper skipping * cleaning main and get tests to run * remove make report? * remove tee * some updates * tee was removed but is the full output still available? * [all-test] * only our tests * don't touch tee in this PR * no atee-sys * proper sub * monkey * only replace call * fix sub * nits * nits * fix invalid syntax * add skip cuda doctest env variable * make sure all packages are installed * move file * update check repo * revert changes * nit * finish cleanup * fix re * findall * update don't test init files * ignore pycache * `-ignore-pycache` when running pytests * try to fix the import missmatch error * install dec * pytest is required as doctest_utils imports things from it * the only log issues were dataset, ignore results should work * more cleaning * Update .circleci/create_circleci_config.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Apply suggestions from code review Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * [ydshieh] empty string if cuda is found * [ydshieh] fix condition * style * [ydshieh] fix * Add comment * style * style * show failure * trigger CI --------- Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> Co-authored-by: ydshieh --- .circleci/config.yml | 1 + .circleci/create_circleci_config.py | 85 ++++++++++- .github/workflows/doctests.yml | 8 - Makefile | 8 +- conftest.py | 12 +- docs/source/en/testing.mdx | 12 +- setup.cfg | 1 + src/transformers/utils/__init__.py | 1 + src/transformers/utils/doctest_utils.py | 189 ++++++++++++++++++++++++ utils/prepare_for_doc_test.py | 148 ------------------- 10 files changed, 287 insertions(+), 178 deletions(-) create mode 100644 src/transformers/utils/doctest_utils.py delete mode 100644 utils/prepare_for_doc_test.py diff --git a/.circleci/config.yml b/.circleci/config.yml index 63c6162fc15d..9af649045803 100644 --- a/.circleci/config.yml 
+++ b/.circleci/config.yml @@ -166,6 +166,7 @@ jobs: - v0.6-repository_consistency - run: pip install --upgrade pip - run: pip install .[all,quality] + - run: pip install pytest - save_cache: key: v0.5-repository_consistency-{{ checksum "setup.py" }} paths: diff --git a/.circleci/create_circleci_config.py b/.circleci/create_circleci_config.py index 7208d876a97c..30898f9e1c2a 100644 --- a/.circleci/create_circleci_config.py +++ b/.circleci/create_circleci_config.py @@ -51,6 +51,8 @@ class CircleCIJob: resource_class: Optional[str] = "xlarge" tests_to_run: Optional[List[str]] = None working_directory: str = "~/transformers" + # This should be only used for doctest job! + command_timeout: Optional[int] = None def __post_init__(self): # Deal with defaults for mutable attributes. @@ -107,11 +109,15 @@ def to_dict(self): steps.append({"store_artifacts": {"path": "~/transformers/installed.txt"}}) all_options = {**COMMON_PYTEST_OPTIONS, **self.pytest_options} - pytest_flags = [f"--{key}={value}" if value is not None else f"-{key}" for key, value in all_options.items()] + pytest_flags = [f"--{key}={value}" if (value is not None or key in ["doctest-modules"]) else f"-{key}" for key, value in all_options.items()] pytest_flags.append( f"--make-reports={self.name}" if "examples" in self.name else f"--make-reports=tests_{self.name}" ) - test_command = f"python -m pytest -n {self.pytest_num_workers} " + " ".join(pytest_flags) + test_command = "" + if self.command_timeout: + test_command = f"timeout {self.command_timeout} " + test_command += f"python -m pytest -n {self.pytest_num_workers} " + " ".join(pytest_flags) + if self.parallelism == 1: if self.tests_to_run is None: test_command += " << pipeline.parameters.tests_to_run >>" @@ -161,12 +167,37 @@ def to_dict(self): steps.append({"store_artifacts": {"path": "~/transformers/tests.txt"}}) steps.append({"store_artifacts": {"path": "~/transformers/splitted_tests.txt"}}) - test_command = f"python -m pytest -n {self.pytest_num_workers} " + " ".join(pytest_flags) + test_command = "" + if self.timeout: + test_command = f"timeout {self.timeout} " + test_command += f"python -m pytest -n {self.pytest_num_workers} " + " ".join(pytest_flags) test_command += " $(cat splitted_tests.txt)" if self.marker is not None: test_command += f" -m {self.marker}" - test_command += " | tee tests_output.txt" + + if self.name == "pr_documentation_tests": + # can't use ` | tee tee tests_output.txt` as usual + test_command += " > tests_output.txt" + # Save the return code, so we can check if it is timeout in the next step. + test_command += '; touch "$?".txt' + # Never fail the test step for the doctest job. We will check the results in the next step, and fail that + # step instead if the actual test failures are found. This is to avoid the timeout being reported as test + # failure. 
+ test_command = f"({test_command}) || true" + else: + test_command += " | tee tests_output.txt" steps.append({"run": {"name": "Run tests", "command": test_command}}) + + # return code `124` means the previous (pytest run) step is timeout + if self.name == "pr_documentation_tests": + checkout_doctest_command = 'if [ -s reports/tests_pr_documentation_tests/failures_short.txt ]; ' + checkout_doctest_command += 'then echo "some test failed"; ' + checkout_doctest_command += 'cat reports/tests_pr_documentation_tests/failures_short.txt; ' + checkout_doctest_command += 'cat reports/tests_pr_documentation_tests/summary_short.txt; exit -1; ' + checkout_doctest_command += 'elif [ -s reports/tests_pr_documentation_tests/stats.txt ]; then echo "All tests pass!"; ' + checkout_doctest_command += 'elif [ -f 124.txt ]; then echo "doctest timeout!"; else echo "other fatal error)"; exit -1; fi;' + steps.append({"run": {"name": "Check doctest results", "command": checkout_doctest_command}}) + steps.append({"store_artifacts": {"path": "~/transformers/tests_output.txt"}}) steps.append({"store_artifacts": {"path": "~/transformers/reports"}}) job["steps"] = steps @@ -401,6 +432,51 @@ def job_name(self): tests_to_run="tests/repo_utils", ) +# At this moment, only the files that are in `utils/documentation_tests.txt` will be kept (together with a dummy file). +py_command = 'import os; import json; fp = open("pr_documentation_tests.txt"); data_1 = fp.read().strip().split("\\n"); fp = open("utils/documentation_tests.txt"); data_2 = fp.read().strip().split("\\n"); to_test = [x for x in data_1 if x in set(data_2)] + ["dummy.py"]; to_test = " ".join(to_test); print(to_test)' +py_command = f"$(python3 -c '{py_command}')" +command = f'echo "{py_command}" > pr_documentation_tests_filtered.txt' +doc_test_job = CircleCIJob( + "pr_documentation_tests", + additional_env={"TRANSFORMERS_VERBOSITY": "error", "DATASETS_VERBOSITY": "error", "SKIP_CUDA_DOCTEST": "1"}, + install_steps=[ + "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng time", + "pip install --upgrade pip", + "pip install -e .[dev]", + "pip install git+https://github.com/huggingface/accelerate", + "pip install --upgrade pytest pytest-sugar", + "find -name __pycache__ -delete", + "find . -name \*.pyc -delete", + # Add an empty file to keep the test step running correctly even no file is selected to be tested. + "touch dummy.py", + { + "name": "Get files to test", + "command": + "git remote add upstream https://github.com/huggingface/transformers.git && git fetch upstream \n" + "git diff --name-only --relative --diff-filter=AMR refs/remotes/upstream/main...HEAD | grep -E '\.(py|mdx)$' | grep -Ev '^\..*|/\.' 
| grep -Ev '__' > pr_documentation_tests.txt" + }, + { + "name": "List files beings changed: pr_documentation_tests.txt", + "command": + "cat pr_documentation_tests.txt" + }, + { + "name": "Filter pr_documentation_tests.txt", + "command": + command + }, + { + "name": "List files beings tested: pr_documentation_tests_filtered.txt", + "command": + "cat pr_documentation_tests_filtered.txt" + }, + ], + tests_to_run="$(cat pr_documentation_tests_filtered.txt)", # noqa + pytest_options={"-doctest-modules": None, "doctest-glob": "*.mdx", "dist": "loadfile", "rvsA": None}, + command_timeout=1200, # test cannot run longer than 1200 seconds + pytest_num_workers=1, +) + REGULAR_TESTS = [ torch_and_tf_job, torch_and_flax_job, @@ -411,6 +487,7 @@ def job_name(self): hub_job, onnx_job, exotic_models_job, + doc_test_job ] EXAMPLES_TESTS = [ examples_torch_job, diff --git a/.github/workflows/doctests.yml b/.github/workflows/doctests.yml index a0efe40cbbe9..55c09b1acc82 100644 --- a/.github/workflows/doctests.yml +++ b/.github/workflows/doctests.yml @@ -37,18 +37,10 @@ jobs: - name: Show installed libraries and their versions run: pip freeze - - name: Prepare files for doctests - run: | - python3 utils/prepare_for_doc_test.py src docs - - name: Run doctests run: | python3 -m pytest -v --make-reports doc_tests_gpu --doctest-modules $(cat utils/documentation_tests.txt) -sv --doctest-continue-on-failure --doctest-glob="*.mdx" - - name: Clean files after doctests - run: | - python3 utils/prepare_for_doc_test.py src docs --remove_new_line - - name: Failure short reports if: ${{ failure() }} continue-on-error: true diff --git a/Makefile b/Makefile index 5e5a11a1fee0..d6d6966a1dad 100644 --- a/Makefile +++ b/Makefile @@ -47,10 +47,10 @@ repo-consistency: # this target runs checks on all files quality: - black --check $(check_dirs) setup.py + black --check $(check_dirs) setup.py conftest.py python utils/custom_init_isort.py --check_only python utils/sort_auto_mappings.py --check_only - ruff $(check_dirs) setup.py + ruff $(check_dirs) setup.py conftest.py doc-builder style src/transformers docs/source --max_len 119 --check_only --path_to_docs docs/source python utils/check_doc_toc.py @@ -65,8 +65,8 @@ extra_style_checks: # this target runs checks on all files and potentially modifies some of them style: - black $(check_dirs) setup.py - ruff $(check_dirs) setup.py --fix + black $(check_dirs) setup.py conftest.py + ruff $(check_dirs) setup.py conftest.py --fix ${MAKE} autogenerate_code ${MAKE} extra_style_checks diff --git a/conftest.py b/conftest.py index 53efec7a6c2d..c57fac2b1d9c 100644 --- a/conftest.py +++ b/conftest.py @@ -20,6 +20,10 @@ import warnings from os.path import abspath, dirname, join +import _pytest + +from transformers.utils.doctest_utils import HfDoctestModule, HfDocTestParser + # allow having multiple repository checkouts and not needing to remember to rerun # 'pip install -e .[dev]' when switching between checkouts and running tests. 
@@ -38,9 +42,7 @@ def pytest_configure(config): config.addinivalue_line( "markers", "is_pt_flax_cross_test: mark test to run only when PT and FLAX interactions are tested" ) - config.addinivalue_line( - "markers", "is_pipeline_test: mark test to run only when pipelines are tested" - ) + config.addinivalue_line("markers", "is_pipeline_test: mark test to run only when pipelines are tested") config.addinivalue_line("markers", "is_staging_test: mark test to run only in the staging environment") config.addinivalue_line("markers", "accelerate_tests: mark test that require accelerate") @@ -66,7 +68,7 @@ def pytest_sessionfinish(session, exitstatus): # Doctest custom flag to ignore output. -IGNORE_RESULT = doctest.register_optionflag('IGNORE_RESULT') +IGNORE_RESULT = doctest.register_optionflag("IGNORE_RESULT") OutputChecker = doctest.OutputChecker @@ -79,3 +81,5 @@ def check_output(self, want, got, optionflags): doctest.OutputChecker = CustomOutputChecker +_pytest.doctest.DoctestModule = HfDoctestModule +doctest.DocTestParser = HfDocTestParser diff --git a/docs/source/en/testing.mdx b/docs/source/en/testing.mdx index 4663b8ac4d93..5adbc8e44db7 100644 --- a/docs/source/en/testing.mdx +++ b/docs/source/en/testing.mdx @@ -212,20 +212,12 @@ Example: ```""" ``` -3 steps are required to debug the docstring examples: -1. In order to properly run the test, **an extra line has to be added** at the end of the docstring. This can be automatically done on any file using: -```bash -python utils/prepare_for_doc_test.py -``` -2. Then, you can use the following line to automatically test every docstring example in the desired file: +Just run the following line to automatically test every docstring example in the desired file: ```bash pytest --doctest-modules ``` -3. Once you are done debugging, you need to remove the extra line added in step **1.** by running the following: -```bash -python utils/prepare_for_doc_test.py --remove_new_line -``` +If the file has a markdown extention, you should add the `--doctest-glob="*.mdx"` argument. ### Run only modified tests diff --git a/setup.cfg b/setup.cfg index 5f47c5c6be69..8b84d3a6d9b9 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,2 +1,3 @@ [tool:pytest] doctest_optionflags=NUMBER NORMALIZE_WHITESPACE ELLIPSIS +doctest_glob=**/*.mdx \ No newline at end of file diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py index 42e856d9e4ac..0600eb382818 100644 --- a/src/transformers/utils/__init__.py +++ b/src/transformers/utils/__init__.py @@ -27,6 +27,7 @@ copy_func, replace_return_docstrings, ) +from .doctest_utils import HfDocTestParser from .generic import ( ContextManagers, ExplicitEnum, diff --git a/src/transformers/utils/doctest_utils.py b/src/transformers/utils/doctest_utils.py new file mode 100644 index 000000000000..90f37e8ce694 --- /dev/null +++ b/src/transformers/utils/doctest_utils.py @@ -0,0 +1,189 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
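The new `doctest_utils` module that follows is built around `preprocess_string`, which rewrites a docstring or `.mdx` snippet before doctest sees it. The sketch below is an illustration rather than part of the patch; it assumes an editable install of this branch, and the snippet string is made up for the example.

```python
# Illustrative call to preprocess_string (defined in the module below). The expected
# behaviour in the comments follows from the implementation, not from a captured run.
from transformers.utils.doctest_utils import preprocess_string

fence = "`" * 3  # triple backtick, kept out of the literal so this example nests cleanly
snippet = (
    f"{fence}python\n"
    ">>> from datasets import load_dataset\n"
    '>>> ds = load_dataset("rotten_tomatoes")\n'
    f"{fence}"
)

# The load_dataset line comes back with " # doctest: +IGNORE_RESULT" appended, so the
# dataset download logs cannot fail the doctest. A snippet matching the cuda heuristic
# would instead make the function return an empty string when skip_cuda_tests=True.
print(preprocess_string(snippet, skip_cuda_tests=True))
```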
+"""
+Utils to run the documentation tests without having to overwrite any files.
+
+The `preprocess_string` function adds `# doctest: +IGNORE_RESULT` markers on the fly anywhere a `load_dataset` call is
+made, since the printed output would otherwise make the corresponding line fail.
+
+To skip cuda tests, make sure to call `SKIP_CUDA_DOCTEST=1 pytest --doctest-modules`
+"""
+import doctest
+import inspect
+import os
+import re
+from typing import Iterable
+
+from _pytest.doctest import (
+    Module,
+    _get_checker,
+    _get_continue_on_failure,
+    _get_runner,
+    _is_mocked,
+    _patch_unwrap_mock_aware,
+    get_optionflags,
+    import_path,
+)
+from _pytest.outcomes import skip
+from pytest import DoctestItem
+
+
+def preprocess_string(string, skip_cuda_tests):
+    """Prepare a docstring or a `.mdx` file to be run by doctest.
+
+    The argument `string` would be the whole file content if it is a `.mdx` file. For a python file, it would be one of
+    its docstring. In each case, it may contain multiple python code examples. If `skip_cuda_tests` is `True` and
+    cuda usage is detected (with a heuristic), this method will return an empty string so no doctest will be run for
+    `string`.
+    """
+    codeblock_pattern = r"(```(?:python|py)\s*\n\s*>>> )((?:.*?\n)*?.*?```)"
+    codeblocks = re.split(re.compile(codeblock_pattern, flags=re.MULTILINE | re.DOTALL), string)
+    is_cuda_found = False
+    for i, codeblock in enumerate(codeblocks):
+        if "load_dataset(" in codeblock and "# doctest: +IGNORE_RESULT" not in codeblock:
+            codeblocks[i] = re.sub(r"(>>> .*load_dataset\(.*)", r"\1 # doctest: +IGNORE_RESULT", codeblock)
+        if (
+            (">>>" in codeblock or "..." in codeblock)
+            and re.search(r"cuda|to\(0\)|device=0", codeblock)
+            and skip_cuda_tests
+        ):
+            is_cuda_found = True
+            break
+    modified_string = ""
+    if not is_cuda_found:
+        modified_string = "".join(codeblocks)
+    return modified_string
+
+
+class HfDocTestParser(doctest.DocTestParser):
+    """
+    Overwrites the DocTestParser from doctest to properly parse the codeblocks that are formatted with black. This
+    means that there are no extra lines at the end of our snippets. The `# doctest: +IGNORE_RESULT` marker is also
+    added anywhere a `load_dataset` call is made, as a print would otherwise fail the corresponding line.
+
+    Tests involving cuda are skipped based on a naive pattern that should be updated if it is not enough.
+    """
+
+    # This regular expression is used to find doctest examples in a
+    # string. It defines three groups: `source` is the source code
+    # (including leading indentation and prompts); `indent` is the
+    # indentation of the first (PS1) line of the source code; and
+    # `want` is the expected output (including leading indentation).
+    # fmt: off
+    _EXAMPLE_RE = re.compile(r'''
+        # Source consists of a PS1 line followed by zero or more PS2 lines.
+        (?P<source>
+            (?:^(?P<indent> [ ]*) >>> .*)    # PS1 line
+            (?:\n           [ ]*  \.\.\. .*)*)  # PS2 lines
+        \n?
+        # Want consists of any non-blank lines that do not start with PS1.
+        (?P<want> (?:(?![ ]*$)    # Not a blank line
+             (?![ ]*>>>)          # Not a line starting with PS1
+             # !!!!!!!!!!! HF Specific !!!!!!!!!!!
+             (?:(?!```).)*        # Match any character except '`' until a '```' is found (this is specific to HF because black removes the last line)
+             # !!!!!!!!!!! HF Specific !!!!!!!!!!!
+             (?:\n|$)             # Match a new line or end of string
+          )*)
+        ''', re.MULTILINE | re.VERBOSE
+    )
+    # fmt: on
+
+    # !!!!!!!!!!! HF Specific !!!!!!!!!!!
+    skip_cuda_tests: bool = bool(os.environ.get("SKIP_CUDA_DOCTEST", False))
+    # !!!!!!!!!!! HF Specific !!!!!!!!!!!
+ + def parse(self, string, name=""): + """ + Overwrites the `parse` method to incorporate a skip for CUDA tests, and remove logs and dataset prints before + calling `super().parse` + """ + string = preprocess_string(string, self.skip_cuda_tests) + return super().parse(string, name) + + +class HfDoctestModule(Module): + """ + Overwrites the `DoctestModule` of the pytest package to make sure the HFDocTestParser is used when discovering + tests. + """ + + def collect(self) -> Iterable[DoctestItem]: + class MockAwareDocTestFinder(doctest.DocTestFinder): + """A hackish doctest finder that overrides stdlib internals to fix a stdlib bug. + + https://github.com/pytest-dev/pytest/issues/3456 https://bugs.python.org/issue25532 + """ + + def _find_lineno(self, obj, source_lines): + """Doctest code does not take into account `@property`, this + is a hackish way to fix it. https://bugs.python.org/issue17446 + + Wrapped Doctests will need to be unwrapped so the correct line number is returned. This will be + reported upstream. #8796 + """ + if isinstance(obj, property): + obj = getattr(obj, "fget", obj) + + if hasattr(obj, "__wrapped__"): + # Get the main obj in case of it being wrapped + obj = inspect.unwrap(obj) + + # Type ignored because this is a private function. + return super()._find_lineno( # type:ignore[misc] + obj, + source_lines, + ) + + def _find(self, tests, obj, name, module, source_lines, globs, seen) -> None: + if _is_mocked(obj): + return + with _patch_unwrap_mock_aware(): + # Type ignored because this is a private function. + super()._find( # type:ignore[misc] + tests, obj, name, module, source_lines, globs, seen + ) + + if self.path.name == "conftest.py": + module = self.config.pluginmanager._importconftest( + self.path, + self.config.getoption("importmode"), + rootpath=self.config.rootpath, + ) + else: + try: + module = import_path( + self.path, + root=self.config.rootpath, + mode=self.config.getoption("importmode"), + ) + except ImportError: + if self.config.getvalue("doctest_ignore_import_errors"): + skip("unable to import module %r" % self.path) + else: + raise + + # !!!!!!!!!!! HF Specific !!!!!!!!!!! + finder = MockAwareDocTestFinder(parser=HfDocTestParser()) + # !!!!!!!!!!! HF Specific !!!!!!!!!!! + optionflags = get_optionflags(self) + runner = _get_runner( + verbose=False, + optionflags=optionflags, + checker=_get_checker(), + continue_on_failure=_get_continue_on_failure(self.config), + ) + for test in finder.find(module, module.__name__): + if test.examples: # skip empty doctests and cuda + yield DoctestItem.from_parent(self, name=test.name, runner=runner, dtest=test) diff --git a/utils/prepare_for_doc_test.py b/utils/prepare_for_doc_test.py deleted file mode 100644 index c55f3540d994..000000000000 --- a/utils/prepare_for_doc_test.py +++ /dev/null @@ -1,148 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Style utils to preprocess files for doc tests. 
- - The doc precossing function can be run on a list of files and/org - directories of files. It will recursively check if the files have - a python code snippet by looking for a ```python or ```py syntax. - In the default mode - `remove_new_line==False` the script will - add a new line before every python code ending ``` line to make - the docstrings ready for pytest doctests. - However, we don't want to have empty lines displayed in the - official documentation which is why the new line command can be - reversed by adding the flag `--remove_new_line` which sets - `remove_new_line==True`. - - When debugging the doc tests locally, please make sure to - always run: - - ```python utils/prepare_for_doc_test.py src docs``` - - before running the doc tests: - - ```pytest --doctest-modules $(cat utils/documentation_tests.txt) -sv --doctest-continue-on-failure --doctest-glob="*.mdx"``` - - Afterwards you should revert the changes by running - - ```python utils/prepare_for_doc_test.py src docs --remove_new_line``` -""" - -import argparse -import os - - -def process_code_block(code, add_new_line=True): - if add_new_line: - return maybe_append_new_line(code) - else: - return maybe_remove_new_line(code) - - -def maybe_append_new_line(code): - """ - Append new line if code snippet is a - Python code snippet - """ - lines = code.split("\n") - - if lines[0] in ["py", "python"]: - # add new line before last line being ``` - last_line = lines[-1] - lines.pop() - lines.append("\n" + last_line) - - return "\n".join(lines) - - -def maybe_remove_new_line(code): - """ - Remove new line if code snippet is a - Python code snippet - """ - lines = code.split("\n") - - if lines[0] in ["py", "python"]: - # add new line before last line being ``` - lines = lines[:-2] + lines[-1:] - - return "\n".join(lines) - - -def process_doc_file(code_file, add_new_line=True): - """ - Process given file. - - Args: - code_file (`str` or `os.PathLike`): The file in which we want to style the docstring. - """ - with open(code_file, "r", encoding="utf-8", newline="\n") as f: - code = f.read() - - # fmt: off - splits = code.split("```") - if len(splits) % 2 != 1: - raise ValueError("The number of occurrences of ``` should be an even number.") - - splits = [s if i % 2 == 0 else process_code_block(s, add_new_line=add_new_line) for i, s in enumerate(splits)] - clean_code = "```".join(splits) - # fmt: on - - diff = clean_code != code - if diff: - print(f"Overwriting content of {code_file}.") - with open(code_file, "w", encoding="utf-8", newline="\n") as f: - f.write(clean_code) - - -def process_doc_files(*files, add_new_line=True): - """ - Applies doc styling or checks everything is correct in a list of files. - - Args: - files (several `str` or `os.PathLike`): The files to treat. - Whether to restyle file or just check if they should be restyled. - - Returns: - List[`str`]: The list of files changed or that should be restyled. 
- """ - for file in files: - # Treat folders - if os.path.isdir(file): - files = [os.path.join(file, f) for f in os.listdir(file)] - files = [f for f in files if os.path.isdir(f) or f.endswith(".mdx") or f.endswith(".py")] - process_doc_files(*files, add_new_line=add_new_line) - else: - try: - process_doc_file(file, add_new_line=add_new_line) - except Exception: - print(f"There is a problem in {file}.") - raise - - -def main(*files, add_new_line=True): - process_doc_files(*files, add_new_line=add_new_line) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("files", nargs="+", help="The file(s) or folder(s) to restyle.") - parser.add_argument( - "--remove_new_line", - action="store_true", - help="Whether to remove new line after each python code block instead of adding one.", - ) - args = parser.parse_args() - - main(*args.files, add_new_line=not args.remove_new_line) From a0c0a7823393f7276f835fef815eabcadd8eaf64 Mon Sep 17 00:00:00 2001 From: Sylvain Gugger Date: Tue, 9 May 2023 14:59:38 -0400 Subject: [PATCH 059/935] v4.30.0.dev0 --- README.md | 8 ++++---- examples/flax/question-answering/run_qa.py | 2 +- examples/flax/text-classification/run_flax_glue.py | 2 +- examples/flax/token-classification/run_flax_ner.py | 2 +- .../audio-classification/run_audio_classification.py | 2 +- examples/pytorch/contrastive-image-text/run_clip.py | 2 +- .../image-classification/run_image_classification.py | 2 +- .../run_image_classification_no_trainer.py | 2 +- examples/pytorch/image-pretraining/run_mae.py | 2 +- examples/pytorch/image-pretraining/run_mim.py | 2 +- examples/pytorch/image-pretraining/run_mim_no_trainer.py | 2 +- examples/pytorch/language-modeling/run_clm.py | 2 +- examples/pytorch/language-modeling/run_clm_no_trainer.py | 2 +- examples/pytorch/language-modeling/run_mlm.py | 2 +- examples/pytorch/language-modeling/run_mlm_no_trainer.py | 2 +- examples/pytorch/language-modeling/run_plm.py | 2 +- examples/pytorch/multiple-choice/run_swag.py | 2 +- examples/pytorch/multiple-choice/run_swag_no_trainer.py | 2 +- examples/pytorch/question-answering/run_qa.py | 2 +- examples/pytorch/question-answering/run_qa_beam_search.py | 2 +- .../question-answering/run_qa_beam_search_no_trainer.py | 2 +- examples/pytorch/question-answering/run_qa_no_trainer.py | 2 +- examples/pytorch/question-answering/run_seq2seq_qa.py | 2 +- .../semantic-segmentation/run_semantic_segmentation.py | 2 +- .../run_semantic_segmentation_no_trainer.py | 2 +- .../speech-recognition/run_speech_recognition_ctc.py | 2 +- .../speech-recognition/run_speech_recognition_seq2seq.py | 2 +- examples/pytorch/summarization/run_summarization.py | 2 +- .../pytorch/summarization/run_summarization_no_trainer.py | 2 +- examples/pytorch/text-classification/run_glue.py | 2 +- .../pytorch/text-classification/run_glue_no_trainer.py | 2 +- examples/pytorch/text-classification/run_xnli.py | 2 +- examples/pytorch/token-classification/run_ner.py | 2 +- .../pytorch/token-classification/run_ner_no_trainer.py | 2 +- examples/pytorch/translation/run_translation.py | 2 +- .../pytorch/translation/run_translation_no_trainer.py | 2 +- examples/tensorflow/contrastive-image-text/run_clip.py | 2 +- .../image-classification/run_image_classification.py | 2 +- examples/tensorflow/multiple-choice/run_swag.py | 2 +- examples/tensorflow/question-answering/run_qa.py | 2 +- examples/tensorflow/summarization/run_summarization.py | 2 +- examples/tensorflow/text-classification/run_glue.py | 2 +- 
examples/tensorflow/translation/run_translation.py | 2 +- setup.py | 2 +- src/transformers/__init__.py | 2 +- 45 files changed, 48 insertions(+), 48 deletions(-) diff --git a/README.md b/README.md index fdceaa1643e0..7b0ff6f293cf 100644 --- a/README.md +++ b/README.md @@ -341,7 +341,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab. 1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela. 1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon. -1. **[FocalNet](https://huggingface.co/docs/transformers/main/model_doc/focalnet)** (from Microsoft Research) released with the paper [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao. +1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (from Microsoft Research) released with the paper [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao. 1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le. 1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang. 1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim. @@ -400,7 +400,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[NLLB-MOE](https://huggingface.co/docs/transformers/model_doc/nllb-moe)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team. 1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh. 1. 
**[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (from SHI Labs) released with the paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi. -1. **[OpenLlama](https://huggingface.co/docs/transformers/main/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). +1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al. 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby. 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. @@ -422,9 +422,9 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli. 1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou. 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu. -1. **[RWKV](https://huggingface.co/docs/transformers/main/model_doc/rwkv)** (from Bo Peng), released on [this repo](https://github.com/BlinkDL/RWKV-LM) by Bo Peng. +1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (from Bo Peng), released on [this repo](https://github.com/BlinkDL/RWKV-LM) by Bo Peng. 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo. -1. 
**[Segment Anything](https://huggingface.co/docs/transformers/main/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick. +1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick. 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. 1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. 1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (from Microsoft Research) released with the paper [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) by Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei. diff --git a/examples/flax/question-answering/run_qa.py b/examples/flax/question-answering/run_qa.py index 5a9f5eb16e0f..78d51fa7e949 100644 --- a/examples/flax/question-answering/run_qa.py +++ b/examples/flax/question-answering/run_qa.py @@ -61,7 +61,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") Array = Any Dataset = datasets.arrow_dataset.Dataset diff --git a/examples/flax/text-classification/run_flax_glue.py b/examples/flax/text-classification/run_flax_glue.py index ffd98152d77c..ac31c2a00db2 100755 --- a/examples/flax/text-classification/run_flax_glue.py +++ b/examples/flax/text-classification/run_flax_glue.py @@ -54,7 +54,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") Array = Any Dataset = datasets.arrow_dataset.Dataset diff --git a/examples/flax/token-classification/run_flax_ner.py b/examples/flax/token-classification/run_flax_ner.py index 8e038ac13679..a2f8ba2e60b6 100644 --- a/examples/flax/token-classification/run_flax_ner.py +++ b/examples/flax/token-classification/run_flax_ner.py @@ -55,7 +55,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") diff --git a/examples/pytorch/audio-classification/run_audio_classification.py b/examples/pytorch/audio-classification/run_audio_classification.py index e9beb8dcf917..f689864271ee 100644 --- a/examples/pytorch/audio-classification/run_audio_classification.py +++ b/examples/pytorch/audio-classification/run_audio_classification.py @@ -45,7 +45,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") require_version("datasets>=1.14.0", "To fix: pip install -r examples/pytorch/audio-classification/requirements.txt") diff --git a/examples/pytorch/contrastive-image-text/run_clip.py b/examples/pytorch/contrastive-image-text/run_clip.py index c30349c37aaa..9fb0da7c35f2 100644 --- a/examples/pytorch/contrastive-image-text/run_clip.py +++ b/examples/pytorch/contrastive-image-text/run_clip.py @@ -54,7 +54,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/contrastive-image-text/requirements.txt") diff --git a/examples/pytorch/image-classification/run_image_classification.py b/examples/pytorch/image-classification/run_image_classification.py index 4b4fee5b5175..10a8ece093f3 100644 --- a/examples/pytorch/image-classification/run_image_classification.py +++ b/examples/pytorch/image-classification/run_image_classification.py @@ -55,7 +55,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt") diff --git a/examples/pytorch/image-classification/run_image_classification_no_trainer.py b/examples/pytorch/image-classification/run_image_classification_no_trainer.py index e30699435431..6a900ff76137 100644 --- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py +++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/image-pretraining/run_mae.py b/examples/pytorch/image-pretraining/run_mae.py index 3f7ef47c6a67..7af1b3e0b5ba 100644 --- a/examples/pytorch/image-pretraining/run_mae.py +++ b/examples/pytorch/image-pretraining/run_mae.py @@ -43,7 +43,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") diff --git a/examples/pytorch/image-pretraining/run_mim.py b/examples/pytorch/image-pretraining/run_mim.py index 874b7c651248..344f5d9b5b81 100644 --- a/examples/pytorch/image-pretraining/run_mim.py +++ b/examples/pytorch/image-pretraining/run_mim.py @@ -48,7 +48,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") diff --git a/examples/pytorch/image-pretraining/run_mim_no_trainer.py b/examples/pytorch/image-pretraining/run_mim_no_trainer.py index a94585d39698..126029150b9e 100644 --- a/examples/pytorch/image-pretraining/run_mim_no_trainer.py +++ b/examples/pytorch/image-pretraining/run_mim_no_trainer.py @@ -53,7 +53,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.25.0.dev0") +check_min_version("4.30.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_clm.py b/examples/pytorch/language-modeling/run_clm.py index 020a6f10dde9..3ffa7a5bba3e 100755 --- a/examples/pytorch/language-modeling/run_clm.py +++ b/examples/pytorch/language-modeling/run_clm.py @@ -55,7 +55,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py index 4bb750a0b024..e7f5e88b2912 100755 --- a/examples/pytorch/language-modeling/run_clm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/language-modeling/run_mlm.py b/examples/pytorch/language-modeling/run_mlm.py index 7b8d67980251..eced8377bcdc 100755 --- a/examples/pytorch/language-modeling/run_mlm.py +++ b/examples/pytorch/language-modeling/run_mlm.py @@ -53,7 +53,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py index 9334de8c0331..30138bfbc31a 100755 --- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_plm.py b/examples/pytorch/language-modeling/run_plm.py index ab955d5b941a..1af82f5fc2da 100755 --- a/examples/pytorch/language-modeling/run_plm.py +++ b/examples/pytorch/language-modeling/run_plm.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/multiple-choice/run_swag.py b/examples/pytorch/multiple-choice/run_swag.py index a0660f0085f3..add403dfb8ad 100755 --- a/examples/pytorch/multiple-choice/run_swag.py +++ b/examples/pytorch/multiple-choice/run_swag.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/pytorch/multiple-choice/run_swag_no_trainer.py b/examples/pytorch/multiple-choice/run_swag_no_trainer.py index 6d3987c8feb4..21f2e1bf04bc 100755 --- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py +++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") logger = get_logger(__name__) # You should update this to your particular problem to have better documentation of `model_type` diff --git a/examples/pytorch/question-answering/run_qa.py b/examples/pytorch/question-answering/run_qa.py index b0dcf6e5d836..29591a457793 100755 --- a/examples/pytorch/question-answering/run_qa.py +++ b/examples/pytorch/question-answering/run_qa.py @@ -49,7 +49,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_qa_beam_search.py b/examples/pytorch/question-answering/run_qa_beam_search.py index fc4b0e0288be..42b5b7b6580b 100755 --- a/examples/pytorch/question-answering/run_qa_beam_search.py +++ b/examples/pytorch/question-answering/run_qa_beam_search.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py index ba7e82cc6470..05c85bdf50e3 100644 --- a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_qa_no_trainer.py b/examples/pytorch/question-answering/run_qa_no_trainer.py index 67d9fb8b455d..2e363ae97098 100755 --- a/examples/pytorch/question-answering/run_qa_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_seq2seq_qa.py b/examples/pytorch/question-answering/run_seq2seq_qa.py index 3c0ac4dfbc22..314c4a201522 100644 --- a/examples/pytorch/question-answering/run_seq2seq_qa.py +++ b/examples/pytorch/question-answering/run_seq2seq_qa.py @@ -46,7 +46,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py index e1027f5d67b4..0f989351eefb 100644 --- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py +++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py @@ -51,7 +51,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/semantic-segmentation/requirements.txt") diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py index 00d115646ac4..783810be64b7 100644 --- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py +++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py @@ -50,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py index 8c4f4352489a..73eea0b79f33 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py @@ -51,7 +51,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py index 419be107b164..22a29fc73054 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") diff --git a/examples/pytorch/summarization/run_summarization.py b/examples/pytorch/summarization/run_summarization.py index e083e68848ef..c89881b0beb9 100755 --- a/examples/pytorch/summarization/run_summarization.py +++ b/examples/pytorch/summarization/run_summarization.py @@ -52,7 +52,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/examples/pytorch/summarization/run_summarization_no_trainer.py b/examples/pytorch/summarization/run_summarization_no_trainer.py index 37ea3bcfbb9e..ea09c6b89e06 100644 --- a/examples/pytorch/summarization/run_summarization_no_trainer.py +++ b/examples/pytorch/summarization/run_summarization_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/examples/pytorch/text-classification/run_glue.py b/examples/pytorch/text-classification/run_glue.py index dd81d535df7b..8f9fa5c1a4b3 100755 --- a/examples/pytorch/text-classification/run_glue.py +++ b/examples/pytorch/text-classification/run_glue.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") diff --git a/examples/pytorch/text-classification/run_glue_no_trainer.py b/examples/pytorch/text-classification/run_glue_no_trainer.py index c71581f7811c..2fbacade06c6 100644 --- a/examples/pytorch/text-classification/run_glue_no_trainer.py +++ b/examples/pytorch/text-classification/run_glue_no_trainer.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/text-classification/run_xnli.py b/examples/pytorch/text-classification/run_xnli.py index 88139986b286..4397b4a9433b 100755 --- a/examples/pytorch/text-classification/run_xnli.py +++ b/examples/pytorch/text-classification/run_xnli.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") diff --git a/examples/pytorch/token-classification/run_ner.py b/examples/pytorch/token-classification/run_ner.py index 9e5dd8d31bd2..3391bcecfc2e 100755 --- a/examples/pytorch/token-classification/run_ner.py +++ b/examples/pytorch/token-classification/run_ner.py @@ -49,7 +49,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") diff --git a/examples/pytorch/token-classification/run_ner_no_trainer.py b/examples/pytorch/token-classification/run_ner_no_trainer.py index 800312839440..bc51fab14e04 100755 --- a/examples/pytorch/token-classification/run_ner_no_trainer.py +++ b/examples/pytorch/token-classification/run_ner_no_trainer.py @@ -55,7 +55,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") diff --git a/examples/pytorch/translation/run_translation.py b/examples/pytorch/translation/run_translation.py index d31a6a8ca035..035bb3b06d98 100755 --- a/examples/pytorch/translation/run_translation.py +++ b/examples/pytorch/translation/run_translation.py @@ -52,7 +52,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt") diff --git a/examples/pytorch/translation/run_translation_no_trainer.py b/examples/pytorch/translation/run_translation_no_trainer.py index 29f3e49f0a07..e52050308ab2 100644 --- a/examples/pytorch/translation/run_translation_no_trainer.py +++ b/examples/pytorch/translation/run_translation_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt") diff --git a/examples/tensorflow/contrastive-image-text/run_clip.py b/examples/tensorflow/contrastive-image-text/run_clip.py index 35359a2fa708..2c696244f629 100644 --- a/examples/tensorflow/contrastive-image-text/run_clip.py +++ b/examples/tensorflow/contrastive-image-text/run_clip.py @@ -51,7 +51,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") require_version( "datasets>=1.8.0", "To fix: pip install -r examples/tensorflow/contrastive-image-text/requirements.txt" diff --git a/examples/tensorflow/image-classification/run_image_classification.py b/examples/tensorflow/image-classification/run_image_classification.py index ee6ebfb46901..61c6cea2cd94 100644 --- a/examples/tensorflow/image-classification/run_image_classification.py +++ b/examples/tensorflow/image-classification/run_image_classification.py @@ -54,7 +54,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt") diff --git a/examples/tensorflow/multiple-choice/run_swag.py b/examples/tensorflow/multiple-choice/run_swag.py index 9c6a90f1dc1b..d3ddca3f134c 100644 --- a/examples/tensorflow/multiple-choice/run_swag.py +++ b/examples/tensorflow/multiple-choice/run_swag.py @@ -50,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/tensorflow/question-answering/run_qa.py b/examples/tensorflow/question-answering/run_qa.py index 7059a9a03212..a42d5111814a 100755 --- a/examples/tensorflow/question-answering/run_qa.py +++ b/examples/tensorflow/question-answering/run_qa.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/tensorflow/summarization/run_summarization.py b/examples/tensorflow/summarization/run_summarization.py index b60a2129166d..f3195d39d96d 100644 --- a/examples/tensorflow/summarization/run_summarization.py +++ b/examples/tensorflow/summarization/run_summarization.py @@ -53,7 +53,7 @@ # region Checking dependencies # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/examples/tensorflow/text-classification/run_glue.py b/examples/tensorflow/text-classification/run_glue.py index 8aa3d4c7fe80..df062c342e5d 100644 --- a/examples/tensorflow/text-classification/run_glue.py +++ b/examples/tensorflow/text-classification/run_glue.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") task_to_keys = { "cola": ("sentence", None), diff --git a/examples/tensorflow/translation/run_translation.py b/examples/tensorflow/translation/run_translation.py index 5f45f752c503..1f31c69245fb 100644 --- a/examples/tensorflow/translation/run_translation.py +++ b/examples/tensorflow/translation/run_translation.py @@ -56,7 +56,7 @@ # region Dependencies and constants # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.29.0.dev0") +check_min_version("4.30.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/setup.py b/setup.py index 41e52e7f7c8e..1d8115f66845 100644 --- a/setup.py +++ b/setup.py @@ -428,7 +428,7 @@ def run(self): setup( name="transformers", - version="4.29.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) + version="4.30.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) author="The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)", author_email="transformers@huggingface.co", description="State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow", diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 375966131e0d..2f1e04c3cb6b 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -18,7 +18,7 @@ # to defer the actual importing for when the objects are requested. This way `import transformers` provides the names # in the namespace without actually importing anything (and especially none of the backends). -__version__ = "4.29.0.dev0" +__version__ = "4.30.0.dev0" from typing import TYPE_CHECKING From 69ee46243c40ea61f63d4b8f78d171ad27b4a046 Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Tue, 9 May 2023 15:26:15 -0400 Subject: [PATCH 060/935] Revert "[Doctests] Refactor doctests + add CI" (#23245) Revert "[Doctests] Refactor doctests + add CI (#22987)" This reverts commit 627f44799a9f4948a6a1b8fe9e536eee0e29ea68. --- .circleci/config.yml | 1 - .circleci/create_circleci_config.py | 85 +---------- .github/workflows/doctests.yml | 8 + Makefile | 8 +- conftest.py | 12 +- docs/source/en/testing.mdx | 12 +- setup.cfg | 1 - src/transformers/utils/__init__.py | 1 - src/transformers/utils/doctest_utils.py | 189 ------------------------ utils/prepare_for_doc_test.py | 148 +++++++++++++++++++ 10 files changed, 178 insertions(+), 287 deletions(-) delete mode 100644 src/transformers/utils/doctest_utils.py create mode 100644 utils/prepare_for_doc_test.py diff --git a/.circleci/config.yml b/.circleci/config.yml index 9af649045803..63c6162fc15d 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -166,7 +166,6 @@ jobs: - v0.6-repository_consistency - run: pip install --upgrade pip - run: pip install .[all,quality] - - run: pip install pytest - save_cache: key: v0.5-repository_consistency-{{ checksum "setup.py" }} paths: diff --git a/.circleci/create_circleci_config.py b/.circleci/create_circleci_config.py index 30898f9e1c2a..7208d876a97c 100644 --- a/.circleci/create_circleci_config.py +++ b/.circleci/create_circleci_config.py @@ -51,8 +51,6 @@ class CircleCIJob: resource_class: Optional[str] = "xlarge" tests_to_run: Optional[List[str]] = None working_directory: str = "~/transformers" - # This should be only used for doctest job! - command_timeout: Optional[int] = None def __post_init__(self): # Deal with defaults for mutable attributes. 
@@ -109,15 +107,11 @@ def to_dict(self): steps.append({"store_artifacts": {"path": "~/transformers/installed.txt"}}) all_options = {**COMMON_PYTEST_OPTIONS, **self.pytest_options} - pytest_flags = [f"--{key}={value}" if (value is not None or key in ["doctest-modules"]) else f"-{key}" for key, value in all_options.items()] + pytest_flags = [f"--{key}={value}" if value is not None else f"-{key}" for key, value in all_options.items()] pytest_flags.append( f"--make-reports={self.name}" if "examples" in self.name else f"--make-reports=tests_{self.name}" ) - test_command = "" - if self.command_timeout: - test_command = f"timeout {self.command_timeout} " - test_command += f"python -m pytest -n {self.pytest_num_workers} " + " ".join(pytest_flags) - + test_command = f"python -m pytest -n {self.pytest_num_workers} " + " ".join(pytest_flags) if self.parallelism == 1: if self.tests_to_run is None: test_command += " << pipeline.parameters.tests_to_run >>" @@ -167,37 +161,12 @@ def to_dict(self): steps.append({"store_artifacts": {"path": "~/transformers/tests.txt"}}) steps.append({"store_artifacts": {"path": "~/transformers/splitted_tests.txt"}}) - test_command = "" - if self.timeout: - test_command = f"timeout {self.timeout} " - test_command += f"python -m pytest -n {self.pytest_num_workers} " + " ".join(pytest_flags) + test_command = f"python -m pytest -n {self.pytest_num_workers} " + " ".join(pytest_flags) test_command += " $(cat splitted_tests.txt)" if self.marker is not None: test_command += f" -m {self.marker}" - - if self.name == "pr_documentation_tests": - # can't use ` | tee tee tests_output.txt` as usual - test_command += " > tests_output.txt" - # Save the return code, so we can check if it is timeout in the next step. - test_command += '; touch "$?".txt' - # Never fail the test step for the doctest job. We will check the results in the next step, and fail that - # step instead if the actual test failures are found. This is to avoid the timeout being reported as test - # failure. - test_command = f"({test_command}) || true" - else: - test_command += " | tee tests_output.txt" + test_command += " | tee tests_output.txt" steps.append({"run": {"name": "Run tests", "command": test_command}}) - - # return code `124` means the previous (pytest run) step is timeout - if self.name == "pr_documentation_tests": - checkout_doctest_command = 'if [ -s reports/tests_pr_documentation_tests/failures_short.txt ]; ' - checkout_doctest_command += 'then echo "some test failed"; ' - checkout_doctest_command += 'cat reports/tests_pr_documentation_tests/failures_short.txt; ' - checkout_doctest_command += 'cat reports/tests_pr_documentation_tests/summary_short.txt; exit -1; ' - checkout_doctest_command += 'elif [ -s reports/tests_pr_documentation_tests/stats.txt ]; then echo "All tests pass!"; ' - checkout_doctest_command += 'elif [ -f 124.txt ]; then echo "doctest timeout!"; else echo "other fatal error)"; exit -1; fi;' - steps.append({"run": {"name": "Check doctest results", "command": checkout_doctest_command}}) - steps.append({"store_artifacts": {"path": "~/transformers/tests_output.txt"}}) steps.append({"store_artifacts": {"path": "~/transformers/reports"}}) job["steps"] = steps @@ -432,51 +401,6 @@ def job_name(self): tests_to_run="tests/repo_utils", ) -# At this moment, only the files that are in `utils/documentation_tests.txt` will be kept (together with a dummy file). 
-py_command = 'import os; import json; fp = open("pr_documentation_tests.txt"); data_1 = fp.read().strip().split("\\n"); fp = open("utils/documentation_tests.txt"); data_2 = fp.read().strip().split("\\n"); to_test = [x for x in data_1 if x in set(data_2)] + ["dummy.py"]; to_test = " ".join(to_test); print(to_test)' -py_command = f"$(python3 -c '{py_command}')" -command = f'echo "{py_command}" > pr_documentation_tests_filtered.txt' -doc_test_job = CircleCIJob( - "pr_documentation_tests", - additional_env={"TRANSFORMERS_VERBOSITY": "error", "DATASETS_VERBOSITY": "error", "SKIP_CUDA_DOCTEST": "1"}, - install_steps=[ - "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng time", - "pip install --upgrade pip", - "pip install -e .[dev]", - "pip install git+https://github.com/huggingface/accelerate", - "pip install --upgrade pytest pytest-sugar", - "find -name __pycache__ -delete", - "find . -name \*.pyc -delete", - # Add an empty file to keep the test step running correctly even no file is selected to be tested. - "touch dummy.py", - { - "name": "Get files to test", - "command": - "git remote add upstream https://github.com/huggingface/transformers.git && git fetch upstream \n" - "git diff --name-only --relative --diff-filter=AMR refs/remotes/upstream/main...HEAD | grep -E '\.(py|mdx)$' | grep -Ev '^\..*|/\.' | grep -Ev '__' > pr_documentation_tests.txt" - }, - { - "name": "List files beings changed: pr_documentation_tests.txt", - "command": - "cat pr_documentation_tests.txt" - }, - { - "name": "Filter pr_documentation_tests.txt", - "command": - command - }, - { - "name": "List files beings tested: pr_documentation_tests_filtered.txt", - "command": - "cat pr_documentation_tests_filtered.txt" - }, - ], - tests_to_run="$(cat pr_documentation_tests_filtered.txt)", # noqa - pytest_options={"-doctest-modules": None, "doctest-glob": "*.mdx", "dist": "loadfile", "rvsA": None}, - command_timeout=1200, # test cannot run longer than 1200 seconds - pytest_num_workers=1, -) - REGULAR_TESTS = [ torch_and_tf_job, torch_and_flax_job, @@ -487,7 +411,6 @@ def job_name(self): hub_job, onnx_job, exotic_models_job, - doc_test_job ] EXAMPLES_TESTS = [ examples_torch_job, diff --git a/.github/workflows/doctests.yml b/.github/workflows/doctests.yml index 55c09b1acc82..a0efe40cbbe9 100644 --- a/.github/workflows/doctests.yml +++ b/.github/workflows/doctests.yml @@ -37,10 +37,18 @@ jobs: - name: Show installed libraries and their versions run: pip freeze + - name: Prepare files for doctests + run: | + python3 utils/prepare_for_doc_test.py src docs + - name: Run doctests run: | python3 -m pytest -v --make-reports doc_tests_gpu --doctest-modules $(cat utils/documentation_tests.txt) -sv --doctest-continue-on-failure --doctest-glob="*.mdx" + - name: Clean files after doctests + run: | + python3 utils/prepare_for_doc_test.py src docs --remove_new_line + - name: Failure short reports if: ${{ failure() }} continue-on-error: true diff --git a/Makefile b/Makefile index d6d6966a1dad..5e5a11a1fee0 100644 --- a/Makefile +++ b/Makefile @@ -47,10 +47,10 @@ repo-consistency: # this target runs checks on all files quality: - black --check $(check_dirs) setup.py conftest.py + black --check $(check_dirs) setup.py python utils/custom_init_isort.py --check_only python utils/sort_auto_mappings.py --check_only - ruff $(check_dirs) setup.py conftest.py + ruff $(check_dirs) setup.py doc-builder style src/transformers docs/source --max_len 119 --check_only --path_to_docs docs/source python 
utils/check_doc_toc.py @@ -65,8 +65,8 @@ extra_style_checks: # this target runs checks on all files and potentially modifies some of them style: - black $(check_dirs) setup.py conftest.py - ruff $(check_dirs) setup.py conftest.py --fix + black $(check_dirs) setup.py + ruff $(check_dirs) setup.py --fix ${MAKE} autogenerate_code ${MAKE} extra_style_checks diff --git a/conftest.py b/conftest.py index c57fac2b1d9c..53efec7a6c2d 100644 --- a/conftest.py +++ b/conftest.py @@ -20,10 +20,6 @@ import warnings from os.path import abspath, dirname, join -import _pytest - -from transformers.utils.doctest_utils import HfDoctestModule, HfDocTestParser - # allow having multiple repository checkouts and not needing to remember to rerun # 'pip install -e .[dev]' when switching between checkouts and running tests. @@ -42,7 +38,9 @@ def pytest_configure(config): config.addinivalue_line( "markers", "is_pt_flax_cross_test: mark test to run only when PT and FLAX interactions are tested" ) - config.addinivalue_line("markers", "is_pipeline_test: mark test to run only when pipelines are tested") + config.addinivalue_line( + "markers", "is_pipeline_test: mark test to run only when pipelines are tested" + ) config.addinivalue_line("markers", "is_staging_test: mark test to run only in the staging environment") config.addinivalue_line("markers", "accelerate_tests: mark test that require accelerate") @@ -68,7 +66,7 @@ def pytest_sessionfinish(session, exitstatus): # Doctest custom flag to ignore output. -IGNORE_RESULT = doctest.register_optionflag("IGNORE_RESULT") +IGNORE_RESULT = doctest.register_optionflag('IGNORE_RESULT') OutputChecker = doctest.OutputChecker @@ -81,5 +79,3 @@ def check_output(self, want, got, optionflags): doctest.OutputChecker = CustomOutputChecker -_pytest.doctest.DoctestModule = HfDoctestModule -doctest.DocTestParser = HfDocTestParser diff --git a/docs/source/en/testing.mdx b/docs/source/en/testing.mdx index 5adbc8e44db7..4663b8ac4d93 100644 --- a/docs/source/en/testing.mdx +++ b/docs/source/en/testing.mdx @@ -212,12 +212,20 @@ Example: ```""" ``` +3 steps are required to debug the docstring examples: +1. In order to properly run the test, **an extra line has to be added** at the end of the docstring. This can be automatically done on any file using: +```bash +python utils/prepare_for_doc_test.py +``` -Just run the following line to automatically test every docstring example in the desired file: +2. Then, you can use the following line to automatically test every docstring example in the desired file: ```bash pytest --doctest-modules ``` -If the file has a markdown extention, you should add the `--doctest-glob="*.mdx"` argument. +3. 
Once you are done debugging, you need to remove the extra line added in step **1.** by running the following: +```bash +python utils/prepare_for_doc_test.py --remove_new_line +``` ### Run only modified tests diff --git a/setup.cfg b/setup.cfg index 8b84d3a6d9b9..5f47c5c6be69 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,3 +1,2 @@ [tool:pytest] doctest_optionflags=NUMBER NORMALIZE_WHITESPACE ELLIPSIS -doctest_glob=**/*.mdx \ No newline at end of file diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py index 0600eb382818..42e856d9e4ac 100644 --- a/src/transformers/utils/__init__.py +++ b/src/transformers/utils/__init__.py @@ -27,7 +27,6 @@ copy_func, replace_return_docstrings, ) -from .doctest_utils import HfDocTestParser from .generic import ( ContextManagers, ExplicitEnum, diff --git a/src/transformers/utils/doctest_utils.py b/src/transformers/utils/doctest_utils.py deleted file mode 100644 index 90f37e8ce694..000000000000 --- a/src/transformers/utils/doctest_utils.py +++ /dev/null @@ -1,189 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Utils to run the documentation tests without having to overwrite any files. - -The `preprocess_string` function adds `# doctest: +IGNORE_RESULT` markers on the fly anywhere a `load_dataset` call is -made as a print would otherwise fail the corresonding line. - -To skip cuda tests, make sure to call `SKIP_CUDA_DOCTEST=1 pytest --doctest-modules -""" -import doctest -import inspect -import os -import re -from typing import Iterable - -from _pytest.doctest import ( - Module, - _get_checker, - _get_continue_on_failure, - _get_runner, - _is_mocked, - _patch_unwrap_mock_aware, - get_optionflags, - import_path, -) -from _pytest.outcomes import skip -from pytest import DoctestItem - - -def preprocess_string(string, skip_cuda_tests): - """Prepare a docstring or a `.mdx` file to be run by doctest. - - The argument `string` would be the whole file content if it is a `.mdx` file. For a python file, it would be one of - its docstring. In each case, it may contain multiple python code examples. If `skip_cuda_tests` is `True` and a - cuda stuff is detective (with a heuristic), this method will return an empty string so no doctest will be run for - `string`. - """ - codeblock_pattern = r"(```(?:python|py)\s*\n\s*>>> )((?:.*?\n)*?.*?```)" - codeblocks = re.split(re.compile(codeblock_pattern, flags=re.MULTILINE | re.DOTALL), string) - is_cuda_found = False - for i, codeblock in enumerate(codeblocks): - if "load_dataset(" in codeblock and "# doctest: +IGNORE_RESULT" not in codeblock: - codeblocks[i] = re.sub(r"(>>> .*load_dataset\(.*)", r"\1 # doctest: +IGNORE_RESULT", codeblock) - if ( - (">>>" in codeblock or "..." 
in codeblock) - and re.search(r"cuda|to\(0\)|device=0", codeblock) - and skip_cuda_tests - ): - is_cuda_found = True - break - modified_string = "" - if not is_cuda_found: - modified_string = "".join(codeblocks) - return modified_string - - -class HfDocTestParser(doctest.DocTestParser): - """ - Overwrites the DocTestParser from doctest to properly parse the codeblocks that are formatted with black. This - means that there are no extra lines at the end of our snippets. The `# doctest: +IGNORE_RESULT` marker is also - added anywhere a `load_dataset` call is made as a print would otherwise fail the corresponding line. - - Tests involving cuda are skipped base on a naive pattern that should be updated if it is not enough. - """ - - # This regular expression is used to find doctest examples in a - # string. It defines three groups: `source` is the source code - # (including leading indentation and prompts); `indent` is the - # indentation of the first (PS1) line of the source code; and - # `want` is the expected output (including leading indentation). - # fmt: off - _EXAMPLE_RE = re.compile(r''' - # Source consists of a PS1 line followed by zero or more PS2 lines. - (?P - (?:^(?P [ ]*) >>> .*) # PS1 line - (?:\n [ ]* \.\.\. .*)*) # PS2 lines - \n? - # Want consists of any non-blank lines that do not start with PS1. - (?P (?:(?![ ]*$) # Not a blank line - (?![ ]*>>>) # Not a line starting with PS1 - # !!!!!!!!!!! HF Specific !!!!!!!!!!! - (?:(?!```).)* # Match any character except '`' until a '```' is found (this is specific to HF because black removes the last line) - # !!!!!!!!!!! HF Specific !!!!!!!!!!! - (?:\n|$) # Match a new line or end of string - )*) - ''', re.MULTILINE | re.VERBOSE - ) - # fmt: on - - # !!!!!!!!!!! HF Specific !!!!!!!!!!! - skip_cuda_tests: bool = bool(os.environ.get("SKIP_CUDA_DOCTEST", False)) - # !!!!!!!!!!! HF Specific !!!!!!!!!!! - - def parse(self, string, name=""): - """ - Overwrites the `parse` method to incorporate a skip for CUDA tests, and remove logs and dataset prints before - calling `super().parse` - """ - string = preprocess_string(string, self.skip_cuda_tests) - return super().parse(string, name) - - -class HfDoctestModule(Module): - """ - Overwrites the `DoctestModule` of the pytest package to make sure the HFDocTestParser is used when discovering - tests. - """ - - def collect(self) -> Iterable[DoctestItem]: - class MockAwareDocTestFinder(doctest.DocTestFinder): - """A hackish doctest finder that overrides stdlib internals to fix a stdlib bug. - - https://github.com/pytest-dev/pytest/issues/3456 https://bugs.python.org/issue25532 - """ - - def _find_lineno(self, obj, source_lines): - """Doctest code does not take into account `@property`, this - is a hackish way to fix it. https://bugs.python.org/issue17446 - - Wrapped Doctests will need to be unwrapped so the correct line number is returned. This will be - reported upstream. #8796 - """ - if isinstance(obj, property): - obj = getattr(obj, "fget", obj) - - if hasattr(obj, "__wrapped__"): - # Get the main obj in case of it being wrapped - obj = inspect.unwrap(obj) - - # Type ignored because this is a private function. - return super()._find_lineno( # type:ignore[misc] - obj, - source_lines, - ) - - def _find(self, tests, obj, name, module, source_lines, globs, seen) -> None: - if _is_mocked(obj): - return - with _patch_unwrap_mock_aware(): - # Type ignored because this is a private function. 
- super()._find( # type:ignore[misc] - tests, obj, name, module, source_lines, globs, seen - ) - - if self.path.name == "conftest.py": - module = self.config.pluginmanager._importconftest( - self.path, - self.config.getoption("importmode"), - rootpath=self.config.rootpath, - ) - else: - try: - module = import_path( - self.path, - root=self.config.rootpath, - mode=self.config.getoption("importmode"), - ) - except ImportError: - if self.config.getvalue("doctest_ignore_import_errors"): - skip("unable to import module %r" % self.path) - else: - raise - - # !!!!!!!!!!! HF Specific !!!!!!!!!!! - finder = MockAwareDocTestFinder(parser=HfDocTestParser()) - # !!!!!!!!!!! HF Specific !!!!!!!!!!! - optionflags = get_optionflags(self) - runner = _get_runner( - verbose=False, - optionflags=optionflags, - checker=_get_checker(), - continue_on_failure=_get_continue_on_failure(self.config), - ) - for test in finder.find(module, module.__name__): - if test.examples: # skip empty doctests and cuda - yield DoctestItem.from_parent(self, name=test.name, runner=runner, dtest=test) diff --git a/utils/prepare_for_doc_test.py b/utils/prepare_for_doc_test.py new file mode 100644 index 000000000000..c55f3540d994 --- /dev/null +++ b/utils/prepare_for_doc_test.py @@ -0,0 +1,148 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Style utils to preprocess files for doc tests. + + The doc precossing function can be run on a list of files and/org + directories of files. It will recursively check if the files have + a python code snippet by looking for a ```python or ```py syntax. + In the default mode - `remove_new_line==False` the script will + add a new line before every python code ending ``` line to make + the docstrings ready for pytest doctests. + However, we don't want to have empty lines displayed in the + official documentation which is why the new line command can be + reversed by adding the flag `--remove_new_line` which sets + `remove_new_line==True`. 
+ + When debugging the doc tests locally, please make sure to + always run: + + ```python utils/prepare_for_doc_test.py src docs``` + + before running the doc tests: + + ```pytest --doctest-modules $(cat utils/documentation_tests.txt) -sv --doctest-continue-on-failure --doctest-glob="*.mdx"``` + + Afterwards you should revert the changes by running + + ```python utils/prepare_for_doc_test.py src docs --remove_new_line``` +""" + +import argparse +import os + + +def process_code_block(code, add_new_line=True): + if add_new_line: + return maybe_append_new_line(code) + else: + return maybe_remove_new_line(code) + + +def maybe_append_new_line(code): + """ + Append new line if code snippet is a + Python code snippet + """ + lines = code.split("\n") + + if lines[0] in ["py", "python"]: + # add new line before last line being ``` + last_line = lines[-1] + lines.pop() + lines.append("\n" + last_line) + + return "\n".join(lines) + + +def maybe_remove_new_line(code): + """ + Remove new line if code snippet is a + Python code snippet + """ + lines = code.split("\n") + + if lines[0] in ["py", "python"]: + # add new line before last line being ``` + lines = lines[:-2] + lines[-1:] + + return "\n".join(lines) + + +def process_doc_file(code_file, add_new_line=True): + """ + Process given file. + + Args: + code_file (`str` or `os.PathLike`): The file in which we want to style the docstring. + """ + with open(code_file, "r", encoding="utf-8", newline="\n") as f: + code = f.read() + + # fmt: off + splits = code.split("```") + if len(splits) % 2 != 1: + raise ValueError("The number of occurrences of ``` should be an even number.") + + splits = [s if i % 2 == 0 else process_code_block(s, add_new_line=add_new_line) for i, s in enumerate(splits)] + clean_code = "```".join(splits) + # fmt: on + + diff = clean_code != code + if diff: + print(f"Overwriting content of {code_file}.") + with open(code_file, "w", encoding="utf-8", newline="\n") as f: + f.write(clean_code) + + +def process_doc_files(*files, add_new_line=True): + """ + Applies doc styling or checks everything is correct in a list of files. + + Args: + files (several `str` or `os.PathLike`): The files to treat. + Whether to restyle file or just check if they should be restyled. + + Returns: + List[`str`]: The list of files changed or that should be restyled. 
+ """ + for file in files: + # Treat folders + if os.path.isdir(file): + files = [os.path.join(file, f) for f in os.listdir(file)] + files = [f for f in files if os.path.isdir(f) or f.endswith(".mdx") or f.endswith(".py")] + process_doc_files(*files, add_new_line=add_new_line) + else: + try: + process_doc_file(file, add_new_line=add_new_line) + except Exception: + print(f"There is a problem in {file}.") + raise + + +def main(*files, add_new_line=True): + process_doc_files(*files, add_new_line=add_new_line) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("files", nargs="+", help="The file(s) or folder(s) to restyle.") + parser.add_argument( + "--remove_new_line", + action="store_true", + help="Whether to remove new line after each python code block instead of adding one.", + ) + args = parser.parse_args() + + main(*args.files, add_new_line=not args.remove_new_line) From 366a8ca09e8dd92dfb1956c8be3118b5a2b13639 Mon Sep 17 00:00:00 2001 From: Kunhao ZHENG Date: Tue, 9 May 2023 22:58:39 +0200 Subject: [PATCH 061/935] Fix `from_config` (#23246) fix --- src/transformers/models/auto/auto_factory.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/transformers/models/auto/auto_factory.py b/src/transformers/models/auto/auto_factory.py index f8bc266fe832..17cc7f95799f 100644 --- a/src/transformers/models/auto/auto_factory.py +++ b/src/transformers/models/auto/auto_factory.py @@ -407,8 +407,7 @@ def from_config(cls, config, **kwargs): repo_id, class_ref = class_ref.split("--") else: repo_id = config.name_or_path - module_file, class_name = class_ref.split(".") - model_class = get_class_from_dynamic_module(repo_id, module_file + ".py", class_name, **kwargs) + model_class = get_class_from_dynamic_module(class_ref, repo_id, **kwargs) return model_class._from_config(config, **kwargs) elif type(config) in cls._model_mapping.keys(): model_class = _get_model_class(config, cls._model_mapping) From 3335724376319a0c453049d0cd883504f530ff52 Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Tue, 9 May 2023 20:37:57 -0400 Subject: [PATCH 062/935] Test composition (#23214) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Remove nestedness in tool config * Really do it * Use remote tools descriptions * Work * Clean up eval * Changes * Tools * Tools * tool * Fix everything * Use last result/assign for evaluation * Prompt * Remove hardcoded selection * Evaluation for chat agents * correct some spelling * Small fixes * Change summarization model (#23172) * Fix link displayed * Update description of the tool * Fixes in chat prompt * Custom tools, custom prompt * Tool clean up * save_pretrained and push_to_hub for tool * Fix init * Tests * Fix tests * Tool save/from_hub/push_to_hub and tool->load_tool * Clean push_to_hub and add app file * Custom inference API for endpoints too * Clean up * old remote tool and new remote tool * Make a requirements * return_code adds tool creation * Avoid redundancy between global variables * Remote tools can be loaded * Tests * Text summarization tests * Quality * Properly mark tests * Test the python interpreter * And the CI shall be green. * fix loading of additional tools * Work on RemoteTool and fix tests * General clean up * Guard imports * Fix tools * docs: Fix broken link in 'How to add a model...' 
(#23216) fix link * Get default endpoint from the Hub * Add guide * Simplify tool config * Docs * Some fixes * Docs * Docs * Docs * Fix code returned by agent * Try this * Match args with signature in remote tool * Should fix python interpreter for Python 3.8 * Fix push_to_hub for tools * Other fixes to push_to_hub * Add API doc page * Docs * Docs * Custom tools * Pin tensorflow-probability (#23220) * Pin tensorflow-probability * [all-test] * [all-test] Fix syntax for bash * PoC for some chaining API * Text to speech * J'ai pris des libertés * Rename * Basic python interpreter * Add agents * Quality * Add translation tool * temp * GenQA + LID + S2T * Quality + word missing in translation * Add open assistance, support f-strings in evaluate * captioning + s2t fixes * Style * Refactor descriptions and remove chain * Support errors and rename OpenAssistantAgent * Add setup * Deal with typos + example of inference API * Some rename + README * Fixes * Update prompt * Unwanted change * Make sure everyone has a default * One prompt to rule them all. * SD * Description * Clean up remote tools * More remote tools * Add option to return code and update doc * Image segmentation * ControlNet * Gradio demo * Diffusers protection * Lib protection * ControlNet description * Cleanup * Style * Remove accelerate and try to be reproducible * No randomness * Male Basic optional in token * Clean description * Better prompts * Fix args eval in interpreter * Add tool wrapper * Tool on the Hub * Style post-rebase * Big refactor of descriptions, batch generation and evaluation for agents * Make problems easier - interface to debug * More problems, add python primitives * Back to one prompt * Remove dict for translation * Be consistent * Add prompts * New version of the agent * Evaluate new agents * New endpoints agents * Make all tools a dict variable * Typo * Add problems * Add to big prompt * Harmonize * Add tools * New evaluation * Add more tools * Build prompt with tools descriptions * Tools on the Hub * Let's chat! * Cleanup * Temporary bs4 safeguard * Cache agents and clean up * Blank init * Fix evaluation for agents * New format for tools on the Hub * Add method to reset state * Remove nestedness in tool config * Really do it * Use remote tools descriptions * Work * Clean up eval * Changes * Tools * Tools * tool * Fix everything * Use last result/assign for evaluation * Prompt * Remove hardcoded selection * Evaluation for chat agents * correct some spelling * Small fixes * Change summarization model (#23172) * Fix link displayed * Update description of the tool * Fixes in chat prompt * Custom tools, custom prompt * Tool clean up * save_pretrained and push_to_hub for tool * Fix init * Tests * Fix tests * Tool save/from_hub/push_to_hub and tool->load_tool * Clean push_to_hub and add app file * Custom inference API for endpoints too * Clean up * old remote tool and new remote tool * Make a requirements * return_code adds tool creation * Avoid redundancy between global variables * Remote tools can be loaded * Tests * Text summarization tests * Quality * Properly mark tests * Test the python interpreter * And the CI shall be green. 
* Work on RemoteTool and fix tests * fix loading of additional tools * General clean up * Guard imports * Fix tools * Get default endpoint from the Hub * Simplify tool config * Add guide * Docs * Some fixes * Docs * Docs * Fix code returned by agent * Try this * Docs * Match args with signature in remote tool * Should fix python interpreter for Python 3.8 * Fix push_to_hub for tools * Other fixes to push_to_hub * Add API doc page * Fixes * Doc fixes * Docs * Fix audio * Custom tools * Audio fix * Improve custom tools docstring * Docstrings * Trigger CI * Mode docstrings * More docstrings * Improve custom tools * Fix for remote tools * Style * Fix repo consistency * Quality * Tip * Cleanup on doc * Cleanup toc * Add disclaimer for starcoder vs openai * Remove disclaimer * Small fixed in the prompts * 4.29 * Update src/transformers/tools/agents.py Co-authored-by: Lysandre Debut * Complete documentation * Small fixes * Agent evaluation * Note about gradio-tools & LC * Clean up agents and prompt * Apply suggestions from code review Co-authored-by: Patrick von Platen * Apply suggestions from code review Co-authored-by: Patrick von Platen * Note about gradio-tools & LC * Add copyrights and address review comments * Quality * Add all language codes * Add remote tool tests * Move custom prompts to other docs * Apply suggestions from code review Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * TTS tests * Quality --------- Co-authored-by: Lysandre Co-authored-by: Patrick von Platen Co-authored-by: Philipp Schmid <32632186+philschmid@users.noreply.github.com> Co-authored-by: Connor Henderson Co-authored-by: Lysandre Co-authored-by: Lysandre Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- conftest.py | 1 + docs/source/en/_toctree.yml | 6 + docs/source/en/custom_tools.mdx | 503 ++++++++++++ docs/source/en/main_classes/agent.mdx | 64 ++ docs/source/en/transformers_agents.mdx | 329 ++++++++ src/transformers/__init__.py | 13 + src/transformers/dynamic_module_utils.py | 33 +- src/transformers/image_utils.py | 4 + src/transformers/testing_utils.py | 16 + src/transformers/tools/__init__.py | 73 ++ src/transformers/tools/agents.py | 489 ++++++++++++ src/transformers/tools/base.py | 722 ++++++++++++++++++ .../tools/document_question_answering.py | 80 ++ src/transformers/tools/evaluate_agent.py | 692 +++++++++++++++++ src/transformers/tools/image_captioning.py | 51 ++ .../tools/image_question_answering.py | 57 ++ src/transformers/tools/image_segmentation.py | 60 ++ src/transformers/tools/prompts.py | 186 +++++ src/transformers/tools/python_interpreter.py | 238 ++++++ src/transformers/tools/speech_to_text.py | 41 + src/transformers/tools/text_classification.py | 70 ++ .../tools/text_question_answering.py | 52 ++ src/transformers/tools/text_summarization.py | 52 ++ src/transformers/tools/text_to_speech.py | 65 ++ src/transformers/tools/translation.py | 271 +++++++ src/transformers/utils/__init__.py | 1 + src/transformers/utils/hub.py | 13 +- src/transformers/utils/import_utils.py | 15 + tests/tools/__init__.py | 0 .../tools/test_document_question_answering.py | 57 ++ tests/tools/test_image_captioning.py | 53 ++ tests/tools/test_image_question_answering.py | 53 ++ tests/tools/test_image_segmentation.py | 53 ++ tests/tools/test_python_interpreter.py | 124 +++ tests/tools/test_speech_to_text.py | 38 + tests/tools/test_text_classification.py | 43 ++ tests/tools/test_text_question_answering.py | 52 ++ tests/tools/test_text_summarization.py | 64 ++ 
tests/tools/test_text_to_speech.py | 54 ++ tests/tools/test_tools_common.py | 100 +++ tests/tools/test_translation.py | 53 ++ 41 files changed, 4933 insertions(+), 8 deletions(-) create mode 100644 docs/source/en/custom_tools.mdx create mode 100644 docs/source/en/main_classes/agent.mdx create mode 100644 docs/source/en/transformers_agents.mdx create mode 100644 src/transformers/tools/__init__.py create mode 100644 src/transformers/tools/agents.py create mode 100644 src/transformers/tools/base.py create mode 100644 src/transformers/tools/document_question_answering.py create mode 100644 src/transformers/tools/evaluate_agent.py create mode 100644 src/transformers/tools/image_captioning.py create mode 100644 src/transformers/tools/image_question_answering.py create mode 100644 src/transformers/tools/image_segmentation.py create mode 100644 src/transformers/tools/prompts.py create mode 100644 src/transformers/tools/python_interpreter.py create mode 100644 src/transformers/tools/speech_to_text.py create mode 100644 src/transformers/tools/text_classification.py create mode 100644 src/transformers/tools/text_question_answering.py create mode 100644 src/transformers/tools/text_summarization.py create mode 100644 src/transformers/tools/text_to_speech.py create mode 100644 src/transformers/tools/translation.py create mode 100644 tests/tools/__init__.py create mode 100644 tests/tools/test_document_question_answering.py create mode 100644 tests/tools/test_image_captioning.py create mode 100644 tests/tools/test_image_question_answering.py create mode 100644 tests/tools/test_image_segmentation.py create mode 100644 tests/tools/test_python_interpreter.py create mode 100644 tests/tools/test_speech_to_text.py create mode 100644 tests/tools/test_text_classification.py create mode 100644 tests/tools/test_text_question_answering.py create mode 100644 tests/tools/test_text_summarization.py create mode 100644 tests/tools/test_text_to_speech.py create mode 100644 tests/tools/test_tools_common.py create mode 100644 tests/tools/test_translation.py diff --git a/conftest.py b/conftest.py index 53efec7a6c2d..683b47705bf4 100644 --- a/conftest.py +++ b/conftest.py @@ -43,6 +43,7 @@ def pytest_configure(config): ) config.addinivalue_line("markers", "is_staging_test: mark test to run only in the staging environment") config.addinivalue_line("markers", "accelerate_tests: mark test that require accelerate") + config.addinivalue_line("markers", "tool_tests: mark the tool tests that are run on their specific schedule") def pytest_addoption(parser): diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index c92f21a93458..c2c50082274d 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -21,6 +21,8 @@ title: Set up distributed training with 🤗 Accelerate - local: model_sharing title: Share your model + - local: transformers_agents + title: Agents title: Tutorials - sections: - sections: @@ -99,6 +101,8 @@ title: Notebooks with examples - local: community title: Community resources + - local: custom_tools + title: Custom Tools - local: troubleshooting title: Troubleshoot title: Developer guides @@ -179,6 +183,8 @@ title: Conceptual guides - sections: - sections: + - local: main_classes/agent + title: Agents and Tools - local: model_doc/auto title: Auto Classes - local: main_classes/callback diff --git a/docs/source/en/custom_tools.mdx b/docs/source/en/custom_tools.mdx new file mode 100644 index 000000000000..f69a2cde90d7 --- /dev/null +++ b/docs/source/en/custom_tools.mdx @@ -0,0 
+1,503 @@ + + +# Custom Tools and Prompts + + + +If you are not aware of what tools and agents are in the context of transformers, we recommend you read the +[Transformers Agents](transformers_agents) page first. + + + + + +Transformers Agent is an experimental API which is subject to change at any time. Results returned by the agents +can vary as the APIs or underlying models are prone to change. + + + +Creating and using custom tools and prompts is paramount to empowering the agent and having it perform new tasks. +In this guide we'll take a look at: + +- How to customize the prompt +- How to use custom tools +- How to create custom tools + +## Customizing the prompt + +As explained in [Transformers Agents](transformers_agents) agents can run in [`~Agent.run`] and [`~Agent.chat`] mode. +Both the run and chat mode underlie the same logic. The language model powering the agent is conditioned on a long prompt +and simply asked to complete the prompt by generating next tokens until the stop token is reached. +The only difference between the `run` and `chat` mode is that during the `chat` mode the prompt is extended with +previous user inputs and model generations, which seemingly gives the agent a memory and allows it to refer to +past interactions. + +Let's take a closer look into how the prompt is structured to understand how it can be best customized. +The prompt is structured broadly into four parts. + +- 1. Introduction: how the agent should behave, explanation of the concept of tools. +- 2. Description of all the tools. This is defined by a `<>` token that is dynamically replaced at runtime with the tools defined/chosen by the user. +- 3. A set of examples of tasks and their solution +- 4. Current example, and request for solution. + +To better understand each part, let's look at a shortened version of how such a prompt can look like in practice. + +``` +I will ask you to perform a task, your job is to come up with a series of simple commands in Python that will perform the task. +[...] +You can print intermediate results if it makes sense to do so. + +Tools: +- document_qa: This is a tool that answers a question about an document (pdf). It takes an input named `document` which should be the document containing the information, as well as a `question` that is the question about the document. It returns a text that contains the answer to the question. +- image_captioner: This is a tool that generates a description of an image. It takes an input named `image` which should be the image to caption, and returns a text that contains the description in English. +[...] + +Task: "Answer the question in the variable `question` about the image stored in the variable `image`. The question is in French." + +I will use the following tools: `translator` to translate the question into English and then `image_qa` to answer the question on the input image. + +Answer: +```py +translated_question = translator(question=question, src_lang="French", tgt_lang="English") +print(f"The translated question is {translated_question}.") +answer = image_qa(image=image, question=translated_question) +print(f"The answer is {answer}") +``` + +Task: "Identify the oldest person in the `document` and create an image showcasing the result as a banner." + +I will use the following tools: `document_qa` to find the oldest person in the document, then `image_generator` to generate an image according to the answer. 
+
+Answer:
+```py
+answer = document_qa(document, question="What is the oldest person?")
+print(f"The answer is {answer}.")
+image = image_generator("A banner showing " + answer)
+```
+
+[...]
+
+Task: "Draw me a picture of rivers and lakes"
+
+I will use the following
+```
+
+The first part explains precisely how the model shall behave and what it should do. This part
+most likely does not need to be customized.
+
+TODO(PVP) - explain better how the .description and .name influence the prompt
+
+### Customizing the tool descriptions
+
+The performance of the agent is directly linked to the prompt itself. We structure the prompt so that it works well
+with what we intend for the agent to do, but for maximum customization we also offer the ability to specify a different prompt when instantiating the agent.
+
+### Customizing the single-execution prompt
+
+In order to specify a custom single-execution prompt, one would do the following:
+
+```py
+template = """ [...] """
+
+agent = HfAgent(your_endpoint, run_prompt_template=template)
+```
+
+
+
+Please make sure to have the `<>` string defined somewhere in the `template` so that the agent can be aware
+of the tools it has available to it.
+
+
+
+### Customizing the chat-execution prompt
+
+In order to specify a custom chat prompt, one would do the following:
+
+```
+template = """ [...] """
+
+agent = HfAgent(
+    url_endpoint=your_endpoint,
+    token=your_hf_token,
+    chat_prompt_template=template
+)
+```
+
+
+
+Please make sure to have the `<>` string defined somewhere in the `template` so that the agent can be
+aware of the tools it has available to it.
+
+
+
+## Using custom tools
+
+In this section, we'll be leveraging two existing custom tools that are specific to image generation:
+
+- We replace [huggingface-tools/image-transformation](https://huggingface.co/spaces/huggingface-tools/image-transformation),
+  with [diffusers/controlnet-canny-tool](https://huggingface.co/spaces/diffusers/controlnet-canny-tool)
+  to allow for more image modifications.
+- We add a new tool for image upscaling to the default toolbox:
+  [diffusers/latent-upscaler-tool](https://huggingface.co/spaces/diffusers/latent-upscaler-tool), which complements the existing tools rather than replacing any of them.
+
+We'll start by loading the custom tools with the convenient [`load_tool`] function:
+
+```py
+from transformers import load_tool
+
+controlnet_transformer = load_tool("diffusers/controlnet-canny-tool")
+upscaler = load_tool("diffusers/latent-upscaler-tool")
+```
+
+Upon adding custom tools to an agent, the tools' descriptions and names are automatically
+included in the agent's prompt. Thus, it is imperative that custom tools have
+a well-written description and name in order for the agent to understand how to use them.
+Let's take a look at the description and name of `controlnet_transformer`:
+
+```py
+print(f"Description: '{controlnet_transformer.description}'")
+print(f"Name: '{controlnet_transformer.name}'")
+```
+
+which gives:
+```
+Description: 'This is a tool that transforms an image with ControlNet according to a prompt.
+It takes two inputs: `image`, which should be the image to transform, and `prompt`, which should be the prompt to use to change it. It returns the modified image.'
+Name: 'image_transformer'
+```
+
+The name and description are accurate and fit the style of the [curated set of tools](./transformers_agents#a-curated-set-of-tools).
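+If a custom tool's wording ever needs adjusting for your use case, both attributes are plain strings and can be
+overridden on the loaded tool before it is handed to the agent. Here is a minimal sketch (the replacement wording
+below is only an illustration, not the tool's official description):
+
+```py
+# Hypothetical tweak: `description` (and `name`) are simple string attributes that the agent
+# reads when building its prompt, so they can be edited before the tool is passed along.
+controlnet_transformer.description = (
+    "This is a tool that transforms an image with ControlNet according to a prompt. "
+    "It takes two inputs: `image` and `prompt`, and returns the modified image."
+)
+```
+
+Keep in mind that the name and description are the only information the agent gets about a tool, so they should
+state clearly what the tool does, what inputs it expects, and what it returns.
+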
+Next, let's instantiate an agent with `controlnet_transformer` and `upscaler`:
+
+```py
+tools = [controlnet_transformer, upscaler]
+agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder", additional_tools=tools)
+```
+
+This command should give you the following info:
+
+```
+image_transformer has been replaced by as provided in `additional_tools`
+```
+
+The set of curated tools already has an `image_transformer` tool which is hereby replaced with our custom tool.
+
+
+
+Overwriting existing tools can be beneficial if we want to use a custom tool for exactly the same task as an existing tool,
+because the agent is already well-versed in that specific task. Beware that the custom tool should follow the exact same API
+as the overwritten tool in this case.
+
+
+
+The upscaler tool was given the name `image_upscaler`, which is not yet present in the default toolbox and is therefore simply added to the list of tools.
+You can always have a look at the toolbox that is currently available to the agent via the `agent.toolbox` attribute:
+
+```py
+print("\n".join([f"- {a}" for a in agent.toolbox.keys()]))
+```
+
+```
+- document_qa
+- image_captioner
+- image_qa
+- image_segmenter
+- transcriber
+- summarizer
+- text_classifier
+- text_qa
+- text_reader
+- translator
+- image_transformer
+- text_downloader
+- image_generator
+- video_generator
+- image_upscaler
+```
+
+Note how `image_upscaler` is now part of the agent's toolbox.
+
+Let's now try out the new tools! We will re-use the image we generated in the [Transformers Agents Quickstart](./transformers_agents#single-execution-run).
+
+```py
+from diffusers.utils import load_image
+
+image = load_image(
+    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes.png"
+)
+```
+
+
+
+Let's transform the image into a beautiful winter landscape:
+
+```py
+image = agent.run("Transform the image: 'A frozen lake and snowy forest'", image=image)
+```
+
+```
+==Explanation from the agent==
+I will use the following tool: `image_transformer` to transform the image.
+
+
+==Code generated by the agent==
+image = image_transformer(image, prompt="A frozen lake and snowy forest")
+```
+
+
+
+The new image processing tool is based on ControlNet, which can make very strong modifications to the image.
+By default the image processing tool returns an image of size 512x512 pixels. Let's see if we can upscale it.
+
+```py
+image = agent.run("Upscale the image", image)
+```
+
+```
+==Explanation from the agent==
+I will use the following tool: `image_upscaler` to upscale the image.
+
+
+==Code generated by the agent==
+upscaled_image = image_upscaler(image)
+```
+
+
+
+The agent automatically mapped our prompt "Upscale the image" to the newly added upscaler tool purely based on its
+description and name, and was able to run it correctly.
+
+Next, let's have a look at how you can create a new custom tool.
+
+### Adding new tools
+
+In this section we show how to create a new tool that can be added to the agent.
+
+#### Creating a new tool
+
+We'll start by creating a tool. We'll add the not-so-useful yet fun task of fetching the model with the most
+downloads for a given task on the Hugging Face Hub.
+ +We can do that with the following code: + +```python +from huggingface_hub import list_models + +task = "text-classification" + +model = next(iter(list_models(filter=task, sort="downloads", direction=-1))) +print(model.id) +``` + +For the task `text-classification`, this returns `'facebook/bart-large-mnli'`, for `translation` it returns `'t5-base`. + +How do we convert this to a tool that the agent can leverage? All tools depend on the superclass `Tool` that holds the +main attributes necessary. We'll create a class that inherits from it: + +```python +from transformers import Tool + + +class HFModelDownloadsTool(Tool): + pass +``` + +This class has a few needs: +- An attribute `name`, which corresponds to the name of the tool itself. To be in tune with other tools which have a + performative name, we'll name it `model_download_counter`. +- An attribute `description`, which will be used to populate the prompt of the agent. +- `inputs` and `outputs` attributes. Defining this will help the python interpreter make educated choices about types, + and will allow for a gradio-demo to be spawned when we push our tool to the Hub. They're both a list of expected + values, which can be `text`, `image`, or `audio`. +- A `__call__` method which contains the inference code. This is the code we've played with above! + +Here's what our class looks like now: + +```python +from transformers import Tool +from huggingface_hub import list_models + + +class HFModelDownloadsTool(Tool): + name = "model_download_counter" + description = ( + "This is a tool that returns the most downloaded model of a given task on the Hugging Face Hub. " + "It takes the name of the category (such as text-classification, depth-estimation, etc), and " + "returns the name of the checkpoint." + ) + + inputs = ["text"] + outputs = ["text"] + + def __call__(self, task: str): + model = next(iter(list_models(filter=task, sort="downloads", direction=-1))) + return model.id +``` + +We now have our tool handy. Save it in a file and import it from your main script. Let's name this file +`model_downloads.py`, so the resulting import code looks like this: + +```python +from model_downloads import HFModelDownloadsTool + +tool = HFModelDownloadsTool() +``` + +In order to let others benefit from it and for simpler initialization, we recommend pushing it to the Hub under your +namespace. To do so, just call `push_to_hub` on the `tool` variable: + +```python +tool.push_to_hub("lysandre/hf-model-downloads") +``` + +You now have your code on the Hub! Let's take a look at the final step, which is to have the agent use it. + +#### Having the agent use the tool + +We now have our tool that lives on the Hub which can be instantiated as such: + +```python +from transformers import load_tool + +tool = load_tool("lysandre/hf-model-downloads") +``` + +In order to use it in the agent, simply pass it in the `additional_tools` parameter of the agent initialization method: + +```python +from transformers import HfAgent + +agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder", additional_tools=[tool]) + +agent.run( + "Can you read out loud the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub?" 
+) +``` +which outputs the following: +``` +==Code generated by the agent== +model = model_download_counter(task="text-to-video") +print(f"The model with the most downloads is {model}.") +audio_model = text_reader(model) + + +==Result== +The model with the most downloads is damo-vilab/text-to-video-ms-1.7b. +``` + +and generates the following audio. + +| **Audio** | +|------------------------------------------------------------------------------------------------------------------------------------------------------| +|